scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28320B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Identifying Inaccurate Descriptions in LLM-generated Code Comments via Test Execution",
      6     "authors": [
      7       "Sungmin Kang",
      8       "Louis Milliken",
      9       "Shin Yoo"
     10     ],
     11     "year": 2024,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2406.14836",
     14     "doi": "10.48550/arXiv.2406.14836"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All major abstract claims (GPT-4 ~20% inaccuracy rate, existing techniques fail, document testing shows robust statistical relationship) are directly supported by Table 1, Figures 1 and 4, and the 540-comment manual evaluation.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "RQ2 provides an explicit ablation study (Figure 7) showing each pipeline component incrementally contributes to performance, supporting the causal claim that each element improves discrimination ability.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 7.1 explicitly bounds results: 'the experimental results we present were done on Java code from the widely-used Defects4J benchmark; further research is required to tell whether these principles would work for other languages and projects.'",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "CodeBERT's seemingly high ROC-AUC is explained as spurious (opposite direction on StarCoder comments); the dominance of failing tests (high w) over passing tests is explained by the labeling criterion that any single inaccuracy makes the whole comment inaccurate.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper carefully distinguishes the proxy measure (correctness estimator / test pass rate) from the target outcome (binary factual accuracy from manual labeling), and evaluates the relationship between the two rather than conflating them.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 7 'Discussion' contains a dedicated subsection 7.1 'Threats to Validity' addressing both internal and external threats with specific examples.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats named: LLM output stochasticity (addressed with 5 repeated runs), possible memorization of Defects4J source code by GPT models, and restriction to Java and one benchmark for external validity.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Scope is explicitly bounded to behavioral comment inaccuracies (not intent/reference errors), to Java methods from Defects4J, and to method-level comments — with specific callouts in Section 7 and throughout RQ4.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source is mentioned anywhere in the paper.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors are identified as affiliated with KAIST, Daejeon, South Korea on the title page.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so independence cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "There is no competing interests or financial disclosure statement anywhere in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "'Factual accuracy' is explicitly defined as 'a comment contains no sentences or phrases that describe the behavior or intent of the code falsely'; 'document testing' is formally defined with a Bayesian mathematical model in Section 4.1.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three contributions are bulleted explicitly: (1) 540-comment manual evaluation, (2) evaluation of 9 existing approaches demonstrating they all fail, (3) the document testing concept and Java implementation.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 8 engages substantively with code-comment consistency detection (Deep-JIT, DocChecker, @tComment), documentation generation (seq2seq approaches, LLM-based), and hallucination detection — explaining how this work relates to and differs from each.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No code repository URL is provided; only supplementary material containing prompts is mentioned, with no explicit open-source code release.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Defects4J is a publicly available standard benchmark; the 180 Java methods are drawn from it without modification.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No requirements file, Dockerfile, or dependency specification is provided; only that Defects4J CLI was used for test execution.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The pipeline is described conceptually (Figure 3) and the prompts are in the supplementary material, but no step-by-step reproduction instructions sufficient to replicate the experiments are provided.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "95% confidence intervals are explicitly shown in Figures 4a, 5, and 7 for all repeated-run metrics.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Welch's t-test and Point-Biserial correlation are used throughout Sections 2 and 6 to assess statistical significance for both baselines and the proposed method.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "ROC-AUC (0.67 for proposed method vs. 0.5 chance) and Average Precision values are reported for all approaches in Figure 5.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The 141-comment evaluation set is the result of filtering for unambiguous labels from 180 samples, with no power analysis or explicit justification for adequacy.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Experiments are repeated 5 times and 95% confidence intervals are plotted in Figures 4a, 5, and 7 to reflect variance from LLM stochasticity.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Nine baselines are evaluated: DocChecker, Deep-JIT, GPT-3-NoCoT, GPT-3-CoT, BLEU, SentenceBERT, CodeT5, CodeBERT, and CID (Table 1 and Figure 5).",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Baselines include 2024 papers (DocChecker, Li and Shin 2024, CID 2024) alongside established tools, representing a current comparison set.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "RQ2 is an explicit ablation with six configurations: comment/signature only, +class name, +constructor, no two-stage prompt, EvoSuite tests, and full default setting (Figure 7).",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "ROC-AUC, Average Precision, test pass rate, Welch's t-test p-values, and Point-Biserial correlation are all reported.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "The manual comment labeling in Section 2.1 is for dataset construction, not evaluation of system outputs; RQ4 is author-performed qualitative example analysis, not a formal user study.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "The 141 unambiguously labeled GPT-3 comments serve as a defined evaluation set; since no model is trained, there is no leakage risk and the evaluation set is defined prior to the main experiments.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Figure 4a breaks down test pass rates by document accuracy category (accurate, behavior-inaccurate, all-inaccurate); Figure 2 provides a categorical taxonomy of four error types with counts.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Section 6.4 (RQ4) explicitly presents two failure cases with concrete examples: environment-dependent test failures (Figure 10) and LLM hallucination of test properties (Figure 11).",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The central contribution of Section 2.2 is a negative result: all nine existing baselines show no statistically significant relationship with comment accuracy, reported with full p-values in Table 1.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "GPT-3 is specified as 'gpt-3.5-turbo-0125'; GPT-4 and StarCoder are cited with specific technical report references. The main experimental model is precisely versioned.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "The paper explicitly states 'The specific prompt is provided in our supplementary material' for both the comment generation and document testing prompts.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "No temperature, top-p, max tokens, or other generation hyperparameters are reported for any LLM calls in the paper.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Section 4.2 and Figure 3 describe the full pipeline in detail: information retrieval from repository, two-stage prompting (property extraction then test generation per property), and test injection and execution using Defects4J CLI.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 2.1 documents sampling criteria (public methods, fixed-file methods, longest comments selected from Defects4J); Section 4.2.1 documents test example selection heuristics for the prompt.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "No repository or URL for the 540 labeled comments or experimental results is provided; only prompts are noted as being in supplementary material.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 2.1 describes sampling in detail: 180 public methods from Defects4J bug-fixing files, longest comments selected, labeled by first author with a subset independently labeled by the second author achieving 87% agreement.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants were recruited; the study uses a standard benchmark (Defects4J).",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline from Defects4J method sampling through LLM comment generation, manual labeling, document testing, and test execution is described across Sections 2 and 4.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Section 7.1 acknowledges 'the training data of the GPT family of LLMs is unknown' — no training cutoff date is stated.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": true,
    300           "justification": "Section 7.1 explicitly discusses the risk that GPT models may have seen Defects4J source code during training, arguing that GPT's failure at direct comment accuracy prediction suggests this did not create an unfair advantage.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": true,
    306           "justification": "The paper acknowledges Defects4J predates GPT training and addresses potential contamination in Section 7.1 with the argument that contamination would have enabled direct accuracy prediction — which was not observed.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human subjects study was conducted.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human subjects study was conducted.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human subjects study was conducted.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human subjects study was conducted.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human subjects study was conducted.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human subjects study was conducted.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human subjects study was conducted.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No API costs, token counts, or latency measurements are reported for any of the GPT-3 calls used in the document testing pipeline.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No total computational budget or resource usage is stated.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Even the best-performing LLM (GPT-4) generates demonstrably inaccurate statements in roughly one fifth of its method-level Java comments",
    373       "evidence": "Manual evaluation of 540 LLM-generated comments (180 methods × 3 LLMs) by the first author with 87% inter-rater agreement; Figure 1 shows ~20% inaccuracy rate for GPT-4, ~33% for GPT-3, ~50% for StarCoder",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "None of the nine evaluated existing code-comment consistency and similarity techniques show a statistically significant relationship with factual comment accuracy",
    378       "evidence": "Table 1 reports Welch's t-test p-values all >0.15 and Point-Biserial correlations near zero for all nine baselines including contemporary 2024 techniques",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Document testing's correctness estimator has a strong statistical relationship with actual comment accuracy (p < 10^-9, AUC = 0.67)",
    383       "evidence": "Figure 4b ROC curve, point-biserial correlation (p < 10^-11), and Welch's t-test (p < 10^-9) across 5 repeated runs on 141 labeled comments",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Failing tests are more predictive of comment inaccuracy than passing tests (best performance at w > 10 in the correctness estimator)",
    388       "evidence": "Figure 8 plots ROC-AUC across w values showing a peak at w > 10; explained by labeling criterion that any single inaccuracy makes an entire comment inaccurate",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Each component of the prompt (class name, constructors, example tests, two-stage prompting) incrementally improves both test generation rate and discrimination ability",
    393       "evidence": "Figure 7 ablation study showing monotonically increasing compilable test rates and ROC-AUC from comment/signature only through full default configuration",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "methodology_tags": [
    398     "benchmark-eval",
    399     "observational"
    400   ],
    401   "key_findings": "Manual evaluation of 540 LLM-generated Java comments reveals inaccuracy rates of 20–50% across three LLMs (GPT-4, GPT-3, StarCoder), demonstrating that LLMs frequently hallucinate or mischaracterize code behavior in comments. All nine evaluated existing code-comment consistency detection and similarity techniques fail to distinguish accurate from inaccurate comments (all p > 0.15). The proposed 'document testing' approach — prompting an LLM to extract testable properties from a comment, generate tests for each property, and execute them — achieves AUC=0.67 (p < 10^-9) as the first technique with a meaningful statistical relationship to factual accuracy. A taxonomy of four error types (hallucinating intent, hallucinating references, lacking code context, code mischaracterization) reveals that the two most common types are potentially detectable via test execution, while intent and reference hallucinations remain hard to automate.",
    402   "red_flags": [
    403     {
    404       "flag": "Small evaluation set",
    405       "detail": "The main document testing experiment uses only 141 unambiguously labeled GPT-3 comments with no power analysis to assess whether this is sufficient for reliable AUC estimates."
    406     },
    407     {
    408       "flag": "Modest discriminative performance",
    409       "detail": "AUC of 0.67 is statistically significant but practically limited; the paper does not quantify the operational cost of false positives/negatives or what AUC would be needed for deployment."
    410     },
    411     {
    412       "flag": "Single language and benchmark",
    413       "detail": "All experiments use Java methods from Defects4J only, with no cross-language or cross-project validation of either the inaccuracy rates or the document testing approach."
    414     },
    415     {
    416       "flag": "Same LLM for generation and testing",
    417       "detail": "GPT-3 is used both to generate the comments being evaluated and to generate the tests for document testing; the same model evaluating its own outputs may create systematic biases not captured in this setup."
    418     },
    419     {
    420       "flag": "No code release",
    421       "detail": "Only prompts are promised in supplementary material; the full implementation pipeline and labeled dataset are not released, substantially limiting reproducibility."
    422     },
    423     {
    424       "flag": "Inter-rater agreement reported only as raw percent agreement",
    425       "detail": "87% agreement between annotators for ground truth labels is reported without Cohen's kappa or analysis of which comment types had lower agreement, potentially inflating confidence in the gold labels."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Deep Just-in-Time Inconsistency Detection Between Comments and Source Code",
    431       "relevance": "Key baseline for code-comment consistency detection; directly compared against as a state-of-the-art technique"
    432     },
    433     {
    434       "title": "DocChecker: Bootstrapping Code Large Language Model for Detecting and Resolving Code-Comment Inconsistencies",
    435       "relevance": "Contemporary 2024 baseline for code-comment consistency, directly compared against"
    436     },
    437     {
    438       "title": "Large Language Models are Few-Shot Summarizers: Multi-Intent Comment Generation via In-Context Learning",
    439       "relevance": "Represents state-of-the-art LLM-based comment generation that motivates the need for accuracy evaluation"
    440     },
    441     {
    442       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    443       "relevance": "Benchmark used for all experiments; source of the 180 Java methods evaluated"
    444     },
    445     {
    446       "title": "StarCoder: may the source be with you!",
    447       "relevance": "One of the three LLMs evaluated for comment generation accuracy"
    448     },
    449     {
    450       "title": "@tComment: Testing Javadoc Comments to Detect Comment-Code Inconsistencies",
    451       "relevance": "Most directly related prior work: uses Randoop random testing to extract and verify comment invariants; document testing extends this concept to LLMs"
    452     },
    453     {
    454       "title": "ChatGPT Incorrectness Detection in Software Reviews",
    455       "relevance": "LLM-based inconsistency detection baseline (CID) evaluated and found ineffective for comment accuracy detection"
    456     },
    457     {
    458       "title": "Large language models are few-shot testers: Exploring LLM-based general bug reproduction",
    459       "relevance": "Prior work by same first author on LLM-based test generation; directly informs the document testing pipeline design"
    460     }
    461   ],
    462   "engagement_factors": {
    463     "practical_relevance": {
    464       "score": 2,
    465       "justification": "Document testing is directly applicable to code review tools and IDE plugins for flagging potentially inaccurate auto-generated comments, though 0.67 AUC limits immediate deployment."
    466     },
    467     "surprise_contrarian": {
    468       "score": 2,
    469       "justification": "The finding that all nine existing code-comment consistency tools completely fail on LLM-generated comments contradicts the assumption that existing tools would generalize to the LLM-generated content regime."
    470     },
    471     "fear_safety": {
    472       "score": 1,
    473       "justification": "Inaccurate code comments can introduce bugs as developers act on misleading documentation, but the safety stakes are moderate compared to AI alignment or security research."
    474     },
    475     "drama_conflict": {
    476       "score": 1,
    477       "justification": "The paper directly contradicts prior findings suggesting SentenceBERT similarity correlates with comment accuracy, but this is a contained technical disagreement."
    478     },
    479     "demo_ability": {
    480       "score": 2,
    481       "justification": "The concept can be demonstrated interactively on any Java codebase with a test suite, and the examples in Figures 9–11 make the approach concretely understandable."
    482     },
    483     "brand_recognition": {
    484       "score": 0,
    485       "justification": "Authors are from KAIST with no affiliation to a major AI lab or industry product; no well-known benchmark or dataset is introduced."
    486     }
    487   },
    488   "hn_data": {
    489     "threads": [
    490       {
    491         "hn_id": "42969750",
    492         "title": "HippoRAG: Neurobiologically Inspired Long-Term Memory for LLMs (2024)",
    493         "points": 65,
    494         "comments": 4,
    495         "url": "https://news.ycombinator.com/item?id=42969750",
    496         "created_at": "2025-02-07T05:34:59Z"
    497       },
    498       {
    499         "hn_id": "44351798",
    500         "title": "Tensor Manipulation Unit (TMU): Reconfigurable, Near-Memory, High-Throughput AI",
    501         "points": 58,
    502         "comments": 13,
    503         "url": "https://news.ycombinator.com/item?id=44351798",
    504         "created_at": "2025-06-23T01:43:11Z"
    505       },
    506       {
    507         "hn_id": "40471638",
    508         "title": "Not All Language Model Features Are Linear",
    509         "points": 9,
    510         "comments": 7,
    511         "url": "https://news.ycombinator.com/item?id=40471638",
    512         "created_at": "2024-05-25T00:18:52Z"
    513       },
    514       {
    515         "hn_id": "42708598",
    516         "title": "HippoRAG: Neurobiologically Inspired Long-Term Memory for Large Language Models",
    517         "points": 3,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=42708598",
    520         "created_at": "2025-01-15T08:18:03Z"
    521       },
    522       {
    523         "hn_id": "42389836",
    524         "title": "From Explicit CoT to Implicit CoT: Learning to Internalize CoT Step by Step",
    525         "points": 2,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=42389836",
    528         "created_at": "2024-12-11T16:57:02Z"
    529       },
    530       {
    531         "hn_id": "41386088",
    532         "title": "Diffusion Models Are Real-Time Game Engines",
    533         "points": 2,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=41386088",
    536         "created_at": "2024-08-29T00:49:38Z"
    537       },
    538       {
    539         "hn_id": "41139190",
    540         "title": "Ranking Large Language Models Without Ground Truth",
    541         "points": 2,
    542         "comments": 0,
    543         "url": "https://news.ycombinator.com/item?id=41139190",
    544         "created_at": "2024-08-02T14:44:52Z"
    545       },
    546       {
    547         "hn_id": "41107497",
    548         "title": "From Explicit Cot to Implicit Cot: Learning to Internalize Cot Step by Step",
    549         "points": 2,
    550         "comments": 0,
    551         "url": "https://news.ycombinator.com/item?id=41107497",
    552         "created_at": "2024-07-30T09:36:30Z"
    553       },
    554       {
    555         "hn_id": "39580207",
    556         "title": "Comparing Inferential Strategies of Humans and LLMs in Deductive Reasoning",
    557         "points": 2,
    558         "comments": 0,
    559         "url": "https://news.ycombinator.com/item?id=39580207",
    560         "created_at": "2024-03-03T11:34:52Z"
    561       },
    562       {
    563         "hn_id": "41486754",
    564         "title": "Diffusion models are real-time game engines",
    565         "points": 1,
    566         "comments": 0,
    567         "url": "https://news.ycombinator.com/item?id=41486754",
    568         "created_at": "2024-09-09T09:20:00Z"
    569       }
    570     ],
    571     "top_points": 65,
    572     "total_points": 146,
    573     "total_comments": 24
    574   }
    575 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs