scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26873B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Importance Sampling is All You Need: Predict LLM's performance on new benchmark by reusing existing benchmark",
      6     "authors": [
      7       "Junjie Shi",
      8       "Wei Ma",
      9       "Shi Ying",
     10       "Lingxiao Jiang",
     11       "Yang Liu",
     12       "Bo Du"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2508.01203",
     17     "doi": "10.48550/arXiv.2508.01203"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract's claims of 1.1% average absolute error for CodeBLEU and 2.15% for pass@1 are directly supported by Tables 3–4; a minor inconsistency between the abstract (1.1%) and the introduction (0.9%) is noted but does not materially misrepresent the results.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper frames prompt distributions as causally determining LLM performance but the design is predictive/observational; the claim that BIS 'significantly reduces data contamination risks' is entirely theoretical with no empirical validation of contamination detection accuracy.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Experiments use only the CodeLlama family (all sharing the same lineage) and two benchmark clusters with similar coding domains; the abstract and conclusion generalize broadly to 'LLMs in code-related tasks' without caveating the single-family limitation.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No alternative explanations are offered for why IWAE outperforms baselines or why semantic metrics are predicted more accurately than code-level ones beyond 'token-level randomness'; the Discussion reframes limitations as future work rather than considering competing interpretations.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper clearly states it predicts specific automated metrics (CodeBLEU, pass@1, cyclomatic complexity, security scores) and does not conflate these proxies with broader notions of 'code quality' or developer productivity.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Section 5 is titled 'Discussion' and frames limitations primarily as 'promising avenues for future research' (cross-domain, cross-language, closed-source models) rather than a dedicated limitations or threats-to-validity section.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The paper never identifies concrete threats such as single-model-family bias, CodeBLEU's known limitations as a quality proxy, or the circularity of evaluating benchmark prediction using benchmarks from the same domain cluster.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "While cross-domain limitations are acknowledged, the paper does not explicitly state what the results do NOT show (e.g., no claim to work with instruction-tuned or RLHF models, no claim beyond same-language pairs).",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Section 7 Acknowledgement clearly states support from the Ministry of Education, Singapore, Academic Research Fund Tier 3 (Award ID: MOET32020-0004).",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All six authors list institutional affiliations (NTU, SMU, Wuhan University) in the paper header; the work evaluates open-source models with no commercial affiliations.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The funder is the Singapore Ministry of Education providing academic research grants; it has no stake in the CodeLlama models or benchmarks being evaluated.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "There is no competing interests statement or declaration of patents, equity, or consulting relationships; the acknowledgement only covers funding.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Importance sampling, IWAE, prompt distribution, and all major notation are explicitly defined in Sections 2–3 and Table 1; key metrics (CodeBLEU, cyclomatic complexity, security scores) are formally defined in Section 4.3.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Three numbered contributions are explicitly listed at the end of the Introduction: theoretical formalization, the BIS framework itself, and empirical validation of IWAE integration.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 engages with code benchmarking literature, importance sampling theory, and VAE/IWAE work, explaining how BIS extends prior methods; comparisons against prior approaches are made empirically in Section 4.4.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No code repository or release link is mentioned anywhere in the paper.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "All benchmarks used (BigCodeBench, HumanEval, EvoEval) are publicly available; no private data was created.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper mentions L20 GPUs and BERT embeddings but provides no requirements file, Dockerfile, or software version specifications.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step instructions are provided; the architectural description in Section 3.2 is conceptual and would require substantial implementation guesswork.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All results in Tables 3–13 are point estimates only; no confidence intervals or error bars are reported for any result.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "Comparative claims (BIS outperforms all baselines) are made without any statistical significance tests or p-values.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Absolute error values are reported across all conditions, providing a consistent effect-size measure; Table 7 includes 'avg of abs' summaries enabling direct magnitude comparison.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The 8,528 data points are presented as a practical collection from available benchmarks with no power analysis or justification for why this quantity is sufficient.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No variance or standard deviation across repeated runs is reported; each configuration appears to be run once with no measure of stability.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Section 4.4 compares BIS against 10 baseline methods: GMM (2 configs), RBM (2 configs), MaxEnt (2 configs), VAE, RSR, LR, DTR, RR, MLP, and RNN.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "The baselines (VAE, GMM, ridge regression, MLP, RNN) are standard and appropriate for this distribution-fitting and regression task; they span both statistical and deep learning approaches.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Section 4.5 explicitly performs ablation on four factors: embedding dimensionality (PCA vs linear), prompt set size, number of IWAE samples, and weight truncation percentile.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Results are reported for CodeBLEU, pass@1, cyclomatic complexity, security scores, and four Halstead code-level metrics (length, volume, effort, time).",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": false,
    206           "answer": false,
    207           "justification": "The paper evaluates automated code quality metrics only; human evaluation is not applicable to this benchmark prediction task.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "The cross-prediction design (BigCode predicts Evo, Evo predicts BigCode) uses each dataset as a held-out test set for the other, providing genuine out-of-distribution evaluation.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are consistently broken down by model size (7B/13B/34B/70B), by source dataset direction, and by metric type (semantic vs code-level).",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": false,
    225           "justification": "Section 5 acknowledges failure conditions (extreme weight distributions, cross-domain settings) theoretically but does not present actual empirical failure cases with concrete examples.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The paper clearly reports that code-level metrics (length, volume) are harder to predict with MAEs up to 10.7%, and that reducing sample size to 100 causes substantial performance degradation.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Specific CodeLlama model sizes (7B, 13B, 34B, 70B) and BERT for embeddings are named; CodeLlama is open-source with deterministic checkpoints.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "No prompting templates or system instructions used to query CodeLlama are provided; only the benchmark prompts themselves (from public datasets) are referenced.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "The chosen IWAE sample count (10) and truncation percentile (0.9) are identified, but learning rate, batch size, training epochs, and BERT embedding configuration are not reported.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "There is no agentic scaffolding; the paper evaluates LLM outputs on fixed code generation benchmarks without any scaffolding layers.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "The paper documents merging HumanEval with EvoEval into 'Evo', min-max normalization procedure, and the cross-prediction framework setup.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "The raw model outputs (LLM-generated code and scores) collected during experiments are not made available; no data repository is linked.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "The collection procedure is described: 4 CodeLlama models are each run on the 2,132 prompts drawn from 9 benchmarks (4 × 2,132 = 8,528 data points in total), using open-source models released before benchmark publication to limit contamination.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "Standard public benchmarks are used; no participant recruitment is involved.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The full pipeline from benchmark prompts → BERT embeddings → IWAE training → importance weight computation → weighted score prediction is documented mathematically in Section 3.2.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "The paper says open-source models 'released before the publication of these benchmarks' were selected, but does not state the actual training data cutoff dates for any CodeLlama variant.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": true,
    303           "justification": "Data contamination is explicitly one of the two central motivating problems; the choice to use only open-source models released before benchmark publication is directly justified on contamination grounds.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": true,
    309           "justification": "The paper deliberately restricts to CodeLlama models predating the benchmarks and discusses the contamination problem extensively in both Introduction and Section 5.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": true,
    361           "justification": "Section 4.1 states the evaluation was conducted by renting 8 servers with L20 GPUs at a total cost of $280.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "$280 across 8 L20 GPU servers is reported as the total computational budget for the experiments.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "BIS achieves an average absolute prediction error of 1.1% for CodeBLEU code correctness scores across 4 CodeLlama models and 2 benchmark clusters.",
    376       "evidence": "Table 3 shows BigCode-source avg absolute error 0.8% and Evo-source 1.4%, averaging 1.1%.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "BIS outperforms all baseline methods including GMM, RBM, MaxEnt, VAE, and regression models under both importance-sampling and non-importance-sampling frameworks.",
    381       "evidence": "Tables 7–8 show BIS achieving avg-of-abs error 0.011 vs. next-best VAE at 0.015 and MaxEnt at 0.017.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "The framework generalizes to metrics beyond CodeBLEU, including cyclomatic complexity (4.6% MAE), security scores (4.3% MAE), and Halstead code-level metrics (up to 10.7% MAE).",
    386       "evidence": "Table 6 reports per-metric, per-model errors across 7 additional metrics.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Prompt distributions alone are sufficient to predict LLM code generation performance without executing generated code or using reference solutions.",
    391       "evidence": "The theoretical derivation in Section 3.3 and empirical results across 9 benchmarks support this, but only for one model family (CodeLlama) in similar coding domains.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "BIS can significantly mitigate data contamination risks by eliminating reliance on test suites and reference solutions.",
    396       "evidence": "This is presented as a theoretical property; no empirical test of contamination detection is performed.",
    397       "supported": "weak"
    398     }
    399   ],
    400   "methodology_tags": [
    401     "benchmark-eval",
    402     "theoretical"
    403   ],
    404   "key_findings": "BIS uses Importance Weighted Autoencoders to model prompt distributions and reweight source benchmark scores to predict target benchmark performance, achieving ~1.1% average absolute error on CodeBLEU for CodeLlama 7B–70B without executing code. The framework outperforms GMM, VAE, RBM, and regression baselines, and extends to cyclomatic complexity and security metrics (3–5% MAE) though performance degrades for Halstead code-level metrics (up to 10.7%). Ablation studies identify optimal IWAE sample count (~10) and weight truncation percentile (0.9), and show that PCA dimensionality reduction is far superior to linear projection. The contamination-reduction motivation is compelling but unvalidated empirically.",
    405   "red_flags": [
    406     {
    407       "flag": "Single model family only",
    408       "detail": "All experiments use CodeLlama variants (7B–70B), which share architecture and training lineage. Generalization to other LLM families is untested but broadly claimed."
    409     },
    410     {
    411       "flag": "No statistical significance testing",
    412       "detail": "All comparative claims (BIS is best) are made on point estimates without confidence intervals or hypothesis tests, making it impossible to assess whether observed differences are meaningful."
    413     },
    414     {
    415       "flag": "Pass@1 tested on single model only",
    416       "detail": "Table 4 reports pass@1 only for CodeLlama-7B 'due to computational constraints', but the abstract and findings generalize pass@1 results without this caveat."
    417     },
    418     {
    419       "flag": "CodeBLEU as ground truth",
    420       "detail": "CodeBLEU is itself a widely-criticized proxy for code correctness; predicting CodeBLEU does not guarantee the framework is useful for predicting actual functional correctness."
    421     },
    422     {
    423       "flag": "No code or raw evaluation data released",
    424       "detail": "Neither the IWAE implementation nor the collected evaluation data are released, making independent replication impossible."
    425     },
    426     {
    427       "flag": "Abstract/intro metric inconsistency",
    428       "detail": "The abstract states 1.1% average error while the introduction states 0.9%; though minor, this internal inconsistency suggests insufficient proofreading."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Evaluating large language models trained on code (HumanEval)",
    434       "relevance": "Primary benchmark used as source/target dataset; foundational LLM code evaluation paper"
    435     },
    436     {
    437       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    438       "relevance": "Second primary benchmark; used as source/target dataset; example of high-cost benchmark development that motivates this work"
    439     },
    440     {
    441       "title": "SWE-Bench: Can language models resolve real-world GitHub issues?",
    442       "relevance": "Representative example of realistic code benchmark with complex test suites"
    443     },
    444     {
    445       "title": "Top leaderboard ranking = top coding proficiency, always? EvoEval: Evolving coding benchmarks via LLM",
    446       "relevance": "Source of the 7 EvoEval sub-benchmarks used as the 'Evo' dataset in experiments"
    447     },
    448     {
    449       "title": "Importance weighted autoencoders (IWAE)",
    450       "relevance": "Core technical component of BIS; provides the distribution modeling method"
    451     },
    452     {
    453       "title": "Does Data Contamination Detection Work (Well) for LLMs? A Survey and Evaluation on Detection Assumptions",
    454       "relevance": "Background on the contamination problem that motivates this work"
    455     },
    456     {
    457       "title": "LessLeak-Bench: A First Investigation of Data Leakage in LLMs Across 83 Software Engineering Benchmarks",
    458       "relevance": "Recent empirical evidence showing benchmark contamination is a real and widespread problem"
    459     },
    460     {
    461       "title": "Code Llama: Open foundation models for code",
    462       "relevance": "The model family used in all experiments"
    463     }
    464   ],
    465   "engagement_factors": {
    466     "practical_relevance": {
    467       "score": 2,
    468       "justification": "Reducing benchmark construction costs is a genuine practical problem, but the current validation on one model family and similar benchmark domains limits immediate applicability."
    469     },
    470     "surprise_contrarian": {
    471       "score": 2,
    472       "justification": "The claim that prompt distributions alone (without code execution or test suites) can predict LLM performance challenges the implicit assumption in all prior benchmark-based evaluation."
    473     },
    474     "fear_safety": {
    475       "score": 0,
    476       "justification": "No AI safety or risk concerns are raised; the paper is a methodological contribution to benchmark evaluation."
    477     },
    478     "drama_conflict": {
    479       "score": 0,
    480       "justification": "Straightforward technical paper with no controversy, competing claims, or adversarial framing."
    481     },
    482     "demo_ability": {
    483       "score": 1,
    484       "justification": "The concept is demonstrable in principle, but no code is released and the setup requires running large LLMs on public benchmarks, limiting casual replication."
    485     },
    486     "brand_recognition": {
    487       "score": 0,
    488       "justification": "Authors are from NTU, SMU, and Wuhan University — credible academic institutions but not AI lab brands with large public followings."
    489     }
    490   },
    491   "hn_data": {
    492     "threads": [
    493       {
    494         "hn_id": "45327964",
    495         "title": "We Politely Insist: Your LLM Must Learn the Persian Art of Taarof",
    496         "points": 181,
    497         "comments": 122,
    498         "url": "https://news.ycombinator.com/item?id=45327964"
    499       },
    500       {
    501         "hn_id": "44455950",
    502         "title": "AI for Scientific Search",
    503         "points": 125,
    504         "comments": 34,
    505         "url": "https://news.ycombinator.com/item?id=44455950"
    506       },
    507       {
    508         "hn_id": "32367085",
    509         "title": "From maximum force to physics in 9 lines – and implications for quantum gravity",
    510         "points": 48,
    511         "comments": 50,
    512         "url": "https://news.ycombinator.com/item?id=32367085"
    513       },
    514       {
    515         "hn_id": "42993305",
    516         "title": "The Differences Between Direct Alignment Algorithms Are a Blur",
    517         "points": 8,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=42993305"
    520       },
    521       {
    522         "hn_id": "40333494",
    523         "title": "From maximum force to physics in 9 lines",
    524         "points": 3,
    525         "comments": 1,
    526         "url": "https://news.ycombinator.com/item?id=40333494"
    527       },
    528       {
    529         "hn_id": "28075769",
    530         "title": "The MIT Supercloud Dataset",
    531         "points": 3,
    532         "comments": 0,
    533         "url": "https://news.ycombinator.com/item?id=28075769"
    534       },
    535       {
    536         "hn_id": "44591216",
    537         "title": "Rethinking the Illusion of Thinking",
    538         "points": 2,
    539         "comments": 1,
    540         "url": "https://news.ycombinator.com/item?id=44591216"
    541       },
    542       {
    543         "hn_id": "19815852",
    544         "title": "What’s Wrong with Risk Matrices?",
    545         "points": 2,
    546         "comments": 1,
    547         "url": "https://news.ycombinator.com/item?id=19815852"
    548       },
    549       {
    550         "hn_id": "44517330",
    551         "title": "A Survey on Latent Reasoning",
    552         "points": 2,
    553         "comments": 0,
    554         "url": "https://news.ycombinator.com/item?id=44517330"
    555       },
    556       {
    557         "hn_id": "43176012",
    558         "title": "Discovering Chunks in Neural Embeddings for Interpretability",
    559         "points": 2,
    560         "comments": 0,
    561         "url": "https://news.ycombinator.com/item?id=43176012"
    562       }
    563     ],
    564     "top_points": 181,
    565     "total_points": 376,
    566     "total_comments": 209
    567   }
    568 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs