scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25572B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Large Language Models for Fault Localization: An Empirical Study",
      6     "authors": [
      7       "YingJian Xiao",
      8       "Rongqun Hu",
      9       "Weiwei Gong",
     10       "Hongwei Li",
     11       "AnQuan Jie"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2510.20521",
     16     "doi": "10.48550/arXiv.2510.20521"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims are supported by experimental results in Tables 3–8: Gemini outperforms others, bug context helps, few-shot shows diminishing returns, CoT depends on model ability.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims like 'bug report context improves performance' are justified through controlled empirical comparisons (with/without context). Table 4 clearly shows the effect across all models.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Section 5 (Threats to External Validity) explicitly bounds conclusions to Java, two specific datasets, four models, and statement-level localization. Claims appropriately scoped.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Paper lacks discussion of *why* CoT hurts GPT-4.1 on HumanEval but helps on Defects4J, or what mechanisms underlie model differences. Findings are reported without mechanistic explanation.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Paper measures statement-level fault localization but extrapolates implications for program repair effectiveness without discussing the gap between locating bugs and successfully fixing them.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 5 'THREATS TO VALIDITY' comprehensively covers internal, construct, and external validity threats with three subsections.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Threats are concrete: prompt phrasing effects, 'perfect match' strictness overestimating errors, potential data contamination with Gemini's Jan 2025 cutoff, and Java-only generalization limits.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Explicit boundaries: Java only, statement-level localization, two datasets, 13 trials per condition, synthetic vs real-world comparison. Scope clearly delineated.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding or grant information provided anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations with Nanchang Institute of Technology and Jiangxi Normal University are listed, though not affiliations with evaluated product companies.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding disclosed; criterion does not apply.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial conflicts statement present.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "Terms like 'fault localization,' 'few-shot learning,' and 'chain-of-thought' are used without formal definitions; paper assumes domain expertise.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 1 explicitly states three contributions: (1) empirical evaluation on two datasets, (2) exploration of prompting strategies, (3) time/cost analysis. Intent is unambiguous.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 covers traditional methods, deep learning approaches, LLMs in fault localization, and prompt engineering. Engagement is broad though somewhat surface-level.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code, scripts, or prompts released. Paper describes experiments but provides no reproducible implementation or prompt templates.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Experiments use two public benchmark datasets: HumanEval-Java and Defects4J v1.2.0, both available for download.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No environment specifications provided: no Python version, library versions, dependency lists, API configuration details, or reproducibility instructions.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Paper describes experimental design and results but does not provide step-by-step instructions to reproduce. Actual prompts are hidden behind 'standardized templates.'",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "Tables 3–8 report point estimates only with no confidence intervals, error bars, or variance measures. Pass@k computed over 13 trials but variance not reported.",
    150         "source": "haiku"
    151       },
    152       "significance_tests": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "No p-values or significance tests applied to performance differences in main results. Scott-Knott ESD test mentioned for time analysis but results not shown in paper.",
    156         "source": "haiku"
    157       },
    158       "effect_sizes_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Effect sizes not formally reported. Percentage-point differences shown (e.g., 65.03% vs 46.63%) but not quantified as standardized effect sizes.",
    162         "source": "haiku"
    163       },
    164       "sample_size_justified": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "Use of 13 trials per condition not justified. No power analysis or sample-size rationale provided.",
    168         "source": "haiku"
    169       },
    170       "variance_reported": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "Main results (Tables 3–8) show single point estimates with no standard deviation, variance, or confidence intervals despite running 13 trials.",
    174         "source": "haiku"
    175       },
    176       "evaluation_design": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Study includes model comparisons, ablations across zero-shot/few-shot/CoT, five performance metrics, plus time and cost analysis across two datasets.",
    180         "source": "haiku"
    181       },
    182       "baselines_included": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Four different models compared as baselines; each prompting strategy (standard, few-shot variants, CoT) serves as ablation against the others.",
    186         "source": "haiku"
    187       },
    188       "baselines_contemporary": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "All models released 2024–2025 (GPT-4.1 mini Apr 2025, Qwen Nov 2024, DeepSeek Dec 2024, Gemini Apr 2025). Baselines are current.",
    192         "source": "haiku"
    193       },
    194       "ablation_study": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Systematic ablations: zero-shot, one/two/three-shot, and CoT. Each strategy tested independently to isolate effect.",
    198         "source": "haiku"
    199       },
    200       "multiple_metrics": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Five performance metrics (Top@5, Top@10, Pass@1, Pass@5, Pass@10) plus time and cost dimensions provide multi-faceted evaluation.",
    204         "source": "haiku"
    205       },
    206       "human_evaluation": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "Automated benchmark evaluation; human evaluation not applicable for this task. Fault localization correctness is objectively verifiable.",
    210         "source": "haiku"
    211       },
    212       "held_out_test_set": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Both HumanEval-Java and Defects4J are established benchmark datasets with fixed test sets by design.",
    216         "source": "haiku"
    217       },
    218       "per_category_breakdown": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "Results grouped by dataset and model but not by bug complexity, type, or category (e.g., logic errors vs type errors). No granular failure analysis.",
    222         "source": "haiku"
    223       },
    224       "failure_cases_discussed": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "Paper does not analyze specific failure modes, provide examples of mislocalizations, or discuss what types of bugs each model struggles with.",
    228         "source": "haiku"
    229       },
    230       "negative_results_reported": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "Negative findings explicitly reported: CoT hurts GPT-4.1 on HumanEval (Table 7), three-shot sometimes underperforms two-shot (Tables 5–6), some models degrade on Defects4J.",
    234         "source": "haiku"
    235       },
    236       "setup_transparency": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "Model versions and knowledge cutoff dates specified in Table 2, but actual prompts not provided. Temperature, top-p, max_tokens, and other inference hyperparameters not disclosed.",
    240         "source": "haiku"
    241       },
    242       "model_versions_specified": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Exact model versions listed: GPT-4.1 mini, Qwen2.5-coder-32b-instruct, Gemini-2.5-flash, DeepSeek-V3. Table 2 includes knowledge cutoff dates.",
    246         "source": "haiku"
    247       },
    248       "prompts_provided": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "Paper mentions 'standardized templates' and 'prompt design' but never includes actual prompts used. Reproduction impossible without them.",
    252         "source": "haiku"
    253       },
    254       "hyperparameters_reported": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No sampling parameters (temperature, top-p, top-k), max_tokens limits, or API configuration details reported.",
    258         "source": "haiku"
    259       },
    260       "scaffolding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "Direct API calls to LLMs; no agentic scaffolding (planning, memory, tool use) employed. Not applicable.",
    264         "source": "haiku"
    265       },
    266       "data_preprocessing_documented": {
    267         "applies": true,
    268         "answer": false,
    269         "justification": "HumanEval-Java and Defects4J properties described, but specific preprocessing, filtering, or feature extraction steps not documented.",
    270         "source": "haiku"
    271       },
    272       "data_integrity": {
    273         "applies": true,
    274         "answer": true,
    275         "justification": "Public benchmarks used; collection procedures for HumanEval-Java (synthetic bugs injected) and Defects4J (real projects) briefly described.",
    276         "source": "haiku"
    277       },
    278       "raw_data_available": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "Both datasets publicly available. HumanEval-Java and Defects4J can be downloaded independently.",
    282         "source": "haiku"
    283       },
    284       "data_collection_described": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "High-level description (164 bugs with JUnit tests, 395 bugs from 6 projects) but insufficient detail on selection criteria, curation process, or representativeness.",
    288         "source": "haiku"
    289       },
    290       "recruitment_methods_described": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "No human participants; criterion does not apply.",
    294         "source": "haiku"
    295       },
    296       "data_pipeline_documented": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Pipeline from benchmark to experiment not fully documented: how bugs were selected, how test cases were validated, how output was parsed and matched.",
    300         "source": "haiku"
    301       },
    302       "contamination": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Training cutoff dates explicitly stated in Table 2. Train-test overlap discussed in Section 5 (Threats to Construct Validity); potential contamination acknowledged but not fully ruled out for Gemini (Jan 2025 cutoff).",
    306         "source": "haiku"
    307       },
    308       "training_cutoff_stated": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Table 2 lists knowledge cutoff for each model: GPT-4.1 mini (2024-06), Qwen (2023-10), Gemini-2.5-flash (2025-01), DeepSeek-V3 (2024-07).",
    312         "source": "haiku"
    313       },
    314       "train_test_overlap_discussed": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "Construct validity section acknowledges 'data contamination cannot be completely ruled out' despite using newer datasets and considering cutoff dates.",
    318         "source": "haiku"
    319       },
    320       "benchmark_contamination_addressed": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "Paper explicitly notes that HumanEval-Java was created to avoid contamination, and acknowledges potential risk for other models and datasets. Limitation acknowledged.",
    324         "source": "haiku"
    325       },
    326       "human_studies": {
    327         "applies": false,
    328         "answer": false,
    329         "justification": "No human participants; all questions in this category do not apply.",
    330         "source": "haiku"
    331       },
    332       "cost_and_practicality": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Section 4.4.2 provides detailed API cost analysis with explicit dollar/yuan costs. Section 4.4.1 reports time overhead with means and max values.",
    336         "source": "haiku"
    337       },
    338       "inference_cost_reported": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "Tables 9–10 show per-call costs in USD and CNY for each model on each dataset. Costs range $0.024–$1.917 per call, thoroughly documented.",
    342         "source": "haiku"
    343       },
    344       "compute_budget_stated": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Individual call times and costs reported, but total computational budget (aggregate tokens, cumulative cost, total compute hours) not calculated or summed.",
    348         "source": "haiku"
    349       }
    350     }
    351   },
    352   "claims": [
    353     {
    354       "claim": "Gemini-2.5-flash outperforms other models on fault localization across both HumanEval-Java and Defects4J",
    355       "evidence": "Tables 3–4: Gemini achieves Top@5=65.03% on HumanEval-Java (vs GPT 50%, Qwen 46.6%, DeepSeek 52.1%) and Top@5=23.67% on Defects4J with bug context (vs GPT 15.15%, Qwen 13.75%, DeepSeek 11.56%)",
    356       "supported": "strong"
    357     },
    358     {
    359       "claim": "Providing bug report context significantly improves fault localization performance for all models",
    360       "evidence": "Table 4 shows dramatic gains on Defects4J: GPT-4.1 mini improves from 3.90% to 15.15% Top@5; Qwen from 4.90% to 13.75%; Gemini from 6.08% to 23.67%; DeepSeek from 2.10% to 11.56%",
    361       "supported": "strong"
    362     },
    363     {
    364       "claim": "Few-shot learning improves performance but exhibits clear diminishing marginal returns beyond two examples",
    365       "evidence": "Tables 5–6 show gains flatten or reverse as examples are added: two-shot typically peaks and three-shot often regresses, and for some models even one-shot is best. E.g., Qwen on HumanEval: one-shot Pass@5=49.09%, two-shot=47.61%, three-shot=47.28%. Similar pattern across models.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Chain-of-thought effectiveness depends on task complexity and model reasoning ability, not uniformly beneficial",
    370       "evidence": "Tables 7–8: CoT hurts GPT-4.1 on HumanEval (Top@5 drops 50% → 34.36%), but helps DeepSeek on Defects4J (Top@5 rises 11.56% → 19.11%). Pattern is model and dataset dependent.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Model inference latency ranges from 1-2 seconds (GPT-4.1 mini) to 20-30+ seconds (Gemini-2.5-flash) depending on task and prompting strategy",
    375       "evidence": "Section 4.4.1: GPT-4.1 mini averages 1-3s, Qwen 2-5s, DeepSeek 3-9s, Gemini 9-30+ seconds. CoT introduces largest latency overhead for Gemini.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "API call costs vary from $0.024 (GPT-4.1 mini) to $1.917 (GPT on Defects4J) per call, with open-source models generally cheaper in local currency",
    380       "evidence": "Tables 9–10: GPT-4.1 mini $0.024–$1.917, Qwen ￥0.144–￥8.642, DeepSeek ￥0.153–￥10.541, Gemini $0.030–$1.344",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Bugs in Defects4J (real projects) are substantially harder to localize than HumanEval-Java (synthetic), with Top@5 roughly 2.7–4.5× lower across all models",
    385       "evidence": "Table 3 vs Table 4 (Top@5, Defects4J with bug context): Gemini drops from 65.03% to 23.67%; GPT-4.1 mini from 50% to 15.15%; Qwen from 46.6% to 13.75%; DeepSeek from 52.1% to 11.56%, i.e. roughly 2.7–4.5× lower on real-world Defects4J",
    386       "supported": "strong"
    387     }
    388   ],
    389   "methodology_tags": [
    390     "benchmark-eval",
    391     "observational",
    392     "comparative"
    393   ],
    394   "key_findings": "The paper systematically evaluated four LLMs (open- and closed-source) on Java fault localization across synthetic (HumanEval-Java, 164 bugs) and real-world (Defects4J, 395 bugs) datasets. Gemini-2.5-flash achieved the strongest overall performance (65% Top@5 on synthetic, 24% on real), while bug report context raised Defects4J Top@5 by roughly 2.8–5.5× for every model. Few-shot learning showed modest gains peaking at two-shot examples with diminishing returns; chain-of-thought had inconsistent effects—hurting GPT-4.1 mini on synthetic tasks but aiding DeepSeek on real-world bugs. Trade-offs between accuracy (24%–65% depending on task), inference latency (1–30 seconds), and API cost ($0.024–$1.917 per call for USD-priced models, ￥0.144–￥10.541 for CNY-priced models) suggest practical model selection requires balancing budget, speed, and precision.",
    395   "red_flags": [
    396     {
    397       "flag": "No confidence intervals or variance",
    398       "detail": "Main results (Tables 3–8) report only point estimates from 13 trials. Impossible to assess statistical significance or confidence bounds on performance differences."
    399     },
    400     {
    401       "flag": "Prompts not released",
    402       "detail": "Paper mentions 'standardized templates' but never provides actual prompt text. Reproducibility critically impaired without exact prompts used."
    403     },
    404     {
    405       "flag": "Data contamination risk underaddressed",
    406       "detail": "Gemini-2.5-flash has Jan 2025 knowledge cutoff; paper written in 2025. Potential overlap with benchmarks not fully verified. Acknowledged in threats but not resolved."
    407     },
    408     {
    409       "flag": "Strict 'perfect match' evaluation may inflate error rates",
    410       "detail": "Paper requires complete line-by-line match; partial credit not reported. Hides whether models are close but wrong, and lacks granularity."
    411     },
    412     {
    413       "flag": "Limited to Java only",
    414       "detail": "All experiments on Java code. Generalization to C++, Python, JavaScript, etc., unverified."
    415     },
    416     {
    417       "flag": "CoT findings unexplained",
    418       "detail": "CoT hurts GPT-4.1 on HumanEval but helps DeepSeek on Defects4J. Paper acknowledges model and task dependency but offers no mechanistic explanation."
    419     },
    420     {
    421       "flag": "No per-category failure analysis",
    422       "detail": "No breakdown by bug type (logic, type error, boundary, etc.). Unclear which models/strategies fail on which bug categories."
    423     },
    424     {
    425       "flag": "Sample size not justified",
    426       "detail": "13 trials chosen without power analysis or justification. Unclear if sufficient for stable estimates."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Software testing with large language models: Survey, landscape, and vision",
    432       "relevance": "Broad survey of LLM applications in software testing; contextualizes fault localization within larger testing automation landscape",
    433       "authors": "Wang et al.",
    434       "year": 2024
    435     },
    436     {
    437       "title": "A Survey of LLMs for Software Engineering",
    438       "relevance": "Comprehensive review of LLM capabilities across software engineering tasks including program repair and code analysis",
    439       "authors": "Chen et al.",
    440       "year": 2023
    441     },
    442     {
    443       "title": "Code generation with LLMs: Evaluation, challenges and opportunities",
    444       "relevance": "Evaluation methodology and metrics for LLM code tasks; informs design of benchmarks and evaluation protocols",
    445       "authors": "Xu et al.",
    446       "year": 2024
    447     },
    448     {
    449       "title": "Large language models in fault localization",
    450       "relevance": "Directly related prior work evaluating ChatGPT on Defects4J; establishes baseline for LLM fault localization",
    451       "authors": "Wu et al.",
    452       "year": 2023
    453     },
    454     {
    455       "title": "LLMAO: LLMs for Test-Free Fault Localization",
    456       "relevance": "Test-free fault localization approach using LLMs; alternative methodology to the bug-report-context baseline",
    457       "authors": "Yang et al.",
    458       "year": 2024
    459     },
    460     {
    461       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    462       "relevance": "Foundational paper on CoT prompting; motivates RQ3 exploration of CoT effectiveness for code reasoning",
    463       "authors": "Wei et al.",
    464       "year": 2022
    465     },
    466     {
    467       "title": "Evaluating fault localization and program repair capabilities of existing closed-source general-purpose LLMs",
    468       "relevance": "Comparative study of GPT-4 and Claude on fault localization and repair; overlapping research question and models",
    469       "authors": "Jiang et al.",
    470       "year": 2024
    471     },
    472     {
    473       "title": "GitBug-Java: A reproducible benchmark of recent Java bugs",
    474       "relevance": "Recent Java bug dataset designed for reproducibility; related to Defects4J benchmark selection",
    475       "authors": "Silva et al.",
    476       "year": 2024
    477     }
    478   ],
    479   "engagement_factors": {
    480     "practical_relevance": {
    481       "score": 2,
    482       "justification": "Fault localization is a real industrial problem. Results are practical (time/cost analysis) but no novel methods proposed—purely observational comparison of existing APIs."
    483     },
    484     "surprise_contrarian": {
    485       "score": 1,
    486       "justification": "Findings confirm conventional wisdom: Gemini is strong, context helps, few-shot shows diminishing returns, CoT is inconsistent. No surprising results that challenge prior understanding."
    487     },
    488     "fear_safety": {
    489       "score": 0,
    490       "justification": "Paper is purely empirical evaluation on safe benchmarks. No AI safety, adversarial, or risk-related concerns raised or explored."
    491     },
    492     "drama_conflict": {
    493       "score": 0,
    494       "justification": "Straightforward technical comparison. No controversy, debate, or conflicting findings presented."
    495     },
    496     "demo_ability": {
    497       "score": 1,
    498       "justification": "Approaches are doable (call public APIs with different prompts) but no released code, prompts, or notebooks make demo/reproduction difficult."
    499     },
    500     "brand_recognition": {
    501       "score": 2,
    502       "justification": "Tests well-known models from major companies (OpenAI GPT-4.1 mini, Google Gemini, Alibaba Qwen, DeepSeek) but no novel models or lesser-known labs."
    503     }
    504   },
    505   "hn_data": {
    506     "threads": [
    507       {
    508         "hn_id": "38424009",
    509         "title": "Does GPT-4 Pass the Turing Test?",
    510         "points": 60,
    511         "comments": 88,
    512         "url": "https://news.ycombinator.com/item?id=38424009"
    513       },
    514       {
    515         "hn_id": "38093289",
    516         "title": "Does GPT-4 Pass the Turing Test?",
    517         "points": 5,
    518         "comments": 1,
    519         "url": "https://news.ycombinator.com/item?id=38093289"
    520       },
    521       {
    522         "hn_id": "10444607",
    523         "title": "From F to DOT: Type Soundness Proofs with Definitional Interpreters",
    524         "points": 2,
    525         "comments": 2,
    526         "url": "https://news.ycombinator.com/item?id=10444607"
    527       }
    528     ],
    529     "top_points": 60,
    530     "total_points": 67,
    531     "total_comments": 91
    532   }
    533 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs