scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29473B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Large Language Model Unlearning for Source Code",
      6     "authors": [
      7       "Xue Jiang",
      8       "Yihong Dong",
      9       "Huangzhao Zhang",
     10       "Tangxinyu Wang",
     11       "Zheng Fang",
     12       "Yingwei Ma",
     13       "Rongyu Cao",
     14       "Binhua Li",
     15       "Zhi Jin",
     16       "Wenpin Jiao",
     17       "Yongbin Li",
     18       "Ge Li"
     19     ],
     20     "year": 2025,
     21     "venue": "arXiv.org",
     22     "arxiv_id": "2506.17125",
     23     "doi": "10.48550/arXiv.2506.17125"
     24   },
     25   "checklist": {
     26     "claims_and_evidence": {
     27       "abstract_claims_supported": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "All abstract claims (superior PDR, robustness to adversarial attacks, broad LLM applicability) are directly supported by Table 1, Figures 3–5, and the multi-LLM experiments in Figure 4.",
     31         "source": "haiku"
     32       },
     33       "causal_claims_justified": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Ablation studies (Figure 6) isolate the effects of loss function choice and hyperparameters; comparative experiments control for all variables except the unlearning method, providing adequate support for causal claims about PROD's advantages.",
     37         "source": "haiku"
     38       },
     39       "generalization_bounded": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The abstract and conclusion state PROD 'holds significant implications for advancing reliable code generation' and 'consistently exhibiting improvements,' but experiments are limited to four 7B-parameter models; generalization to larger models or other architectures is not tested.",
     43         "source": "haiku"
     44       },
     45       "alternative_explanations_discussed": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper attributes PROD's success entirely to token-level granularity but does not consider alternative explanations such as whether the intentional memorization fine-tuning step inflates baseline degradation, or whether PROD's lower forget thoroughness on some metrics explains retained utility.",
     49         "source": "haiku"
     50       },
     51       "proxy_outcome_distinction": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper explicitly labels 1−BLEU as 'an indicator of potential copyright infringement' and says model utility is 'estimated' through HumanEval, acknowledging the proxy nature of both measurements.",
     55         "source": "haiku"
     56       }
     57     },
     58     "limitations_and_scope": {
     59       "limitations_section_present": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion lists only strengths and contributions without acknowledging scope constraints.",
     63         "source": "haiku"
     64       },
     65       "threats_to_validity_specific": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No threats to validity are discussed anywhere in the paper; the intentional memorization fine-tuning step used for copyright/insecurity tasks (which may not reflect real-world memorization) is not flagged as a validity concern.",
     69         "source": "haiku"
     70       },
     71       "scope_boundaries_stated": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The paper does not state what the results do NOT show; there is no explicit acknowledgment that findings are bounded to 7B models, specific task formats, or the tested code domains.",
     75         "source": "haiku"
     76       }
     77     },
     78     "conflicts_of_interest": {
     79       "funding_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Funding is disclosed: National Key R&D Program No. 2023YFB4503801, NSFC grants 62192733/62192730/62192731, and Major Program of Hubei Province No. 2023BAA024.",
     83         "source": "haiku"
     84       },
     85       "affiliations_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Author affiliations with Peking University and Tongyi Lab (Alibaba Group) are disclosed; the internship context is also noted.",
     89         "source": "haiku"
     90       },
     91       "funder_independent_of_outcome": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The listed funders are Chinese government research agencies independent of the study outcome; however, two authors are affiliated with Alibaba's Tongyi Lab and one evaluated model (Qwen2.5-Coder-7B) is an Alibaba product, which is a potential conflict of interest.",
     95         "source": "haiku"
     96       },
     97       "financial_interests_declared": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No competing interests statement is provided; the Alibaba affiliation of two authors evaluating an Alibaba model (Qwen2.5-Coder) is not flagged as a financial interest.",
    101         "source": "haiku"
    102       }
    103     },
    104     "scope_and_framing": {
    105       "key_terms_defined": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Key terms are precisely defined: 'unlearning' is formalized mathematically (Eq. 1), 'forget quality' and 'model utility' are defined with specific metrics, and 'PDR' is defined formally (Eq. 6).",
    109         "source": "haiku"
    110       },
    111       "intended_contribution_clear": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Four contributions are explicitly enumerated: (1) empirical investigation of existing methods on code tasks, (2) PROD method, (3) benchmark covering three tasks, and (4) PDR metric.",
    115         "source": "haiku"
    116       },
    117       "engagement_with_prior_work": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper provides detailed technical descriptions of all four baselines (GA, NPO, DPO, FLAT) including their formulations and explains why their coarse granularity fails on code, situating PROD as a targeted improvement.",
    121         "source": "haiku"
    122       }
    123     }
    124   },
    125   "type_checklist": {
    126     "empirical": {
    127       "artifacts": {
    128         "code_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Source code and data are released at https://github.com/jiangxxxue/PROD as stated in the abstract and contributions section.",
    132           "source": "haiku"
    133         },
    134         "data_released": {
    135           "applies": true,
    136           "answer": true,
    137           "justification": "All three forget datasets use publicly available corpora: The Stack (copyright), CyberSecEval/Purple Llama (insecure code), and VersiCode (deprecated APIs); HumanEval is used for utility.",
    138           "source": "haiku"
    139         },
    140         "environment_specified": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "The paper mentions 4 NVIDIA A100 GPUs and Python-level hyperparameters but provides no requirements.txt, Dockerfile, or dependency specification in the paper text.",
    144           "source": "haiku"
    145         },
    146         "reproduction_instructions": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "While hyperparameters are listed in the implementation details section, no step-by-step reproduction instructions are included in the paper itself; readers must infer the pipeline from the description.",
    150           "source": "haiku"
    151         }
    152       },
    153       "statistical_methodology": {
    154         "confidence_intervals_or_error_bars": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "Main results in Table 1 report only point estimates for PDR; Figure 5 shows min/max ranges for adversarial attack results only, not for primary comparative claims.",
    158           "source": "haiku"
    159         },
    160         "significance_tests": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "No statistical significance tests are applied to any comparative claims despite training being repeated five times with different seeds.",
    164           "source": "haiku"
    165         },
    166         "effect_sizes_reported": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "The paper reports a '124% average relative gain over the strongest competitor' in PDR, and absolute PDR values are given with baseline context in Table 1.",
    170           "source": "haiku"
    171         },
    172         "sample_size_justified": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "The choice of 100 files for the copyright forget set, 1,916 snippets for insecure code, and 252 packages for deprecated APIs is not justified by power analysis or statistical reasoning.",
    176           "source": "haiku"
    177         },
    178         "variance_reported": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "Five random seeds are used but Table 1 reports only mean PDR values without standard deviations; variance is only shown for adversarial attack results in Figure 5.",
    182           "source": "haiku"
    183         }
    184       },
    185       "evaluation_design": {
    186         "baselines_included": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Four baselines are included: Gradient Ascent (GA), Direct Preference Optimization (DPO), Negative Preference Optimization (NPO), and FLAT.",
    190           "source": "haiku"
    191         },
    192         "baselines_contemporary": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "All four baselines are from 2023–2025 publications representing the current state of LLM unlearning; FLAT (Wang et al. 2025) and NPO (Zhang et al. 2024) are very recent.",
    196           "source": "haiku"
    197         },
    198         "ablation_study": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Figure 6 presents ablations on loss function type (CE, KL, JS), nucleus sampling threshold p (0.2, 0.4, 0.8, 0.9, 1.0), and suppression strength α across six values.",
    202           "source": "haiku"
    203         },
    204         "multiple_metrics": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "The evaluation uses task-specific forget quality metrics (1−BLEU, pass rate under static analysis, exact match), HumanEval pass rate for utility, and the composite PDR metric.",
    208           "source": "haiku"
    209         },
    210         "human_evaluation": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Three volunteer evaluators with 2+ years of software development experience conducted a blinded perceptual quality comparison of 20 samples per task; win rates are reported in Table 2.",
    214           "source": "haiku"
    215         },
    216         "held_out_test_set": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Each task uses a held-out test structure: second half of copyright files for evaluation, separate test prompts for insecure code, and temporally split still-valid APIs for deprecated API evaluation.",
    220           "source": "haiku"
    221         },
    222         "per_category_breakdown": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Table 1 reports PDR scores separately for all three tasks (copyright, insecurity, deprecation), and Figure 3 shows separate scatter plots for each task.",
    226           "source": "haiku"
    227         },
    228         "failure_cases_discussed": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Figure 1 presents a detailed case study of three failure modes in baselines (mute refusal, token collapse, syntactic incoherence); Figure 6 shows PROD's performance degradation under poor hyperparameter choices.",
    232           "source": "haiku"
    233         },
    234         "negative_results_reported": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "The ablation in Figure 6 explicitly shows that removing noise elimination (p=1.0) yields the worst performance, and that certain α values degrade forgetting or utility.",
    238           "source": "haiku"
    239         }
    240       },
    241       "setup_transparency": {
    242         "model_versions_specified": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Specific model versions are cited: CodeLlama-7B, Qwen2.5-Coder-7B, Deepseek-coder-6.7B, and Starcoder-7B, each referencing a specific technical report.",
    246           "source": "haiku"
    247         },
    248         "prompts_provided": {
    249           "applies": false,
    250           "answer": false,
    251           "justification": "This is a fine-tuning/unlearning paper, not a prompting study; the 'prompts' are the code prefixes from the benchmark datasets rather than engineered system instructions.",
    252           "source": "haiku"
    253         },
    254         "hyperparameters_reported": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "Learning rate (2e-5 for memorization, grid search from {1e-4 to 1e-6} for unlearning), batch size (32), epochs (10), optimizer (AdamW, weight_decay=0.01), sequence length (1024), and method-specific parameters (β=0.1, p=0.8, α=0) are all reported.",
    258           "source": "haiku"
    259         },
    260         "scaffolding_described": {
    261           "applies": false,
    262           "answer": false,
    263           "justification": "This is not an agentic AI paper; no agentic scaffolding is used.",
    264           "source": "haiku"
    265         },
    266         "data_preprocessing_documented": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Preprocessing is documented for all tasks: copyright filtering criteria (removing test files, templates), insecure code adoption from CyberSecEval, and three-step deprecated API processing (version filtration, file filtration, temporal split).",
    270           "source": "haiku"
    271         }
    272       },
    273       "data_integrity": {
    274         "raw_data_available": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "All source datasets are publicly available (The Stack, CyberSecEval, VersiCode, HumanEval), and the code repository is released.",
    278           "source": "haiku"
    279         },
    280         "data_collection_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Data curation is described for each task with specific filtering criteria, dataset sizes (100 files, 1,916 snippets, 252 packages/3,449 snippets), and evaluation setup.",
    284           "source": "haiku"
    285         },
    286         "recruitment_methods_described": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "Human evaluator recruitment is minimally described: 'three independent volunteer evaluators (each with two years or more of software development experience).'",
    290           "source": "haiku"
    291         },
    292         "data_pipeline_documented": {
    293           "applies": true,
    294           "answer": true,
    295           "justification": "The full pipeline from dataset curation through memorization fine-tuning, unlearning application, and evaluation is described in the Evaluation and Implementation Details sections.",
    296           "source": "haiku"
    297         }
    298       },
    299       "contamination": {
    300         "training_cutoff_stated": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "The training data cutoffs for CodeLlama-7B, Qwen2.5-Coder-7B, Deepseek-coder-6.7B, and Starcoder-7B are not stated, which is relevant since the paper intentionally targets snippets these models may have memorized.",
    304           "source": "haiku"
    305         },
    306         "train_test_overlap_discussed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "The paper does not discuss whether HumanEval (used for model utility) was in the training data of the evaluated models, nor whether the Stack snippets used as forget data were actually in training corpora.",
    310           "source": "haiku"
    311         },
    312         "benchmark_contamination_addressed": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "HumanEval is a well-known benchmark likely present in all evaluated models' training data; this is not acknowledged as a potential confound for the utility measurement.",
    316           "source": "haiku"
    317         }
    318       },
    319       "human_studies": {
    320         "pre_registered": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "The perceptual quality study with human evaluators is not pre-registered.",
    324           "source": "haiku"
    325         },
    326         "irb_or_ethics_approval": {
    327           "applies": true,
    328           "answer": false,
    329           "justification": "No IRB or ethics approval is mentioned for the human evaluator study.",
    330           "source": "haiku"
    331         },
    332         "demographics_reported": {
    333           "applies": true,
    334           "answer": false,
    335           "justification": "Only professional experience level (2+ years software development) is reported; no age, gender, or other demographic information is provided for the three evaluators.",
    336           "source": "haiku"
    337         },
    338         "inclusion_exclusion_criteria": {
    339           "applies": true,
    340           "answer": true,
    341           "justification": "The inclusion criterion '2+ years of software development experience' is stated for human evaluators.",
    342           "source": "haiku"
    343         },
    344         "randomization_described": {
    345           "applies": true,
    346           "answer": true,
    347           "justification": "'We randomly select 20 samples from each of the three downstream tasks' for human evaluation.",
    348           "source": "haiku"
    349         },
    350         "blinding_described": {
    351           "applies": true,
    352           "answer": true,
    353           "justification": "'Each human evaluator assesses all samples without knowing which approach produced each sample' — blinding is explicitly described.",
    354           "source": "haiku"
    355         },
    356         "attrition_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "With only three evaluators in a one-shot perceptual study, attrition is not applicable.",
    360           "source": "haiku"
    361         }
    362       },
    363       "cost_and_practicality": {
    364         "inference_cost_reported": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No inference latency or cost is reported; the paper mentions hardware (4 A100 GPUs) but provides no inference timing data.",
    368           "source": "haiku"
    369         },
    370         "compute_budget_stated": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "Hardware (4 NVIDIA A100 GPUs) is mentioned but total GPU-hours, training time, or compute cost is not stated.",
    374           "source": "haiku"
    375         }
    376       }
    377     }
    378   },
    379   "claims": [
    380     {
    381       "claim": "Existing unlearning methods (GA, NPO, DPO, FLAT) cause near-total utility degradation when applied to source code at high forget quality.",
    382       "evidence": "Figure 3 shows all four baselines achieve ~90% forgetting but their HumanEval utility drops to near zero; Figure 1 illustrates mute refusal, token collapse, and syntactic incoherence.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "PROD achieves superior Pareto Dominance Ratio (PDR) over all baselines across all three tasks.",
    387       "evidence": "Table 1: PROD scores 41.8%, 70.4%, 58.0% PDR on copyright/insecurity/deprecation tasks vs best baselines of 31.5%, 24.5%, 51.6% respectively.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "PROD generalizes consistently across four distinct code LLMs of varying architectures.",
    392       "evidence": "Figure 4 shows all four models (CodeLlama-7B, Qwen2.5-Coder-7B, Deepseek-coder-6.7B, Starcoder-7B) achieve near-perfect forget quality while maintaining substantially higher utility than baselines.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "PROD is the only unlearning method robust to prefix injection adversarial attacks.",
    397       "evidence": "Figure 5 shows PROD's similarity to copyrighted code stays below 0.05 under attack while all baselines exceed 0.3; PROD also shows the smallest variance.",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "Token-level granularity (vs. sample-level) is the key mechanism preventing utility collapse.",
    402       "evidence": "The theoretical argument is coherent and supported by the contrast between PROD and baselines, but no direct ablation isolates granularity level as a variable independent of the redistribution mechanism.",
    403       "supported": "moderate"
    404     },
    405     {
    406       "claim": "PROD achieves >70% win rate against all baselines in human perceptual quality evaluation.",
    407       "evidence": "Table 2 shows win rates of 81%, 92%, 76%, 87% vs GA, DPO, NPO, FLAT respectively (human); GPT-4 evaluation shows 77%, 86%, 73%, 81%.",
    408       "supported": "moderate"
    409     }
    410   ],
    411   "methodology_tags": [
    412     "benchmark-eval",
    413     "observational"
    414   ],
    415   "key_findings": "PROD introduces token-level probabilistic redistribution for LLM code unlearning, surgically zeroing target token probabilities and renormalizing over remaining vocabulary to preserve programming language knowledge. In experiments across three unlearning tasks (copyright, insecure code, deprecated APIs) on four 7B code LLMs, PROD achieves 41–70% PDR versus 24.5–51.6% for the best baseline on each task; existing methods reach high forget quality only by collapsing HumanEval utility to near zero. PROD is also the only method resilient to prefix injection adversarial attacks, maintaining similarity below 0.05 to forgotten content. The paper simultaneously introduces a benchmark and PDR metric for multi-objective unlearning evaluation.",
    416   "red_flags": [
    417     {
    418       "flag": "Trivially small human study",
    419       "detail": "The perceptual quality evaluation uses only 3 volunteer evaluators assessing 20 samples per task — far too few for reliable inter-rater statistics or significance testing."
    420     },
    421     {
    422       "flag": "No statistical significance testing",
    423       "detail": "All comparative claims (PDR, win rates, adversarial attack robustness) are made without significance tests despite 5-seed training runs, making it impossible to assess whether differences are statistically meaningful."
    424     },
    425     {
    426       "flag": "Variance omitted from main results",
    427       "detail": "Table 1 reports only mean PDR values without standard deviations across the 5 random seeds; variance is reported only for adversarial attack results."
    428     },
    429     {
    430       "flag": "Artificial memorization step inflates baseline degradation",
    431       "detail": "For copyright and insecurity tasks, models are first fine-tuned on forget data to ensure memorization before unlearning. This setup may exaggerate how severely existing methods degrade utility compared to real-world memorization patterns."
    432     },
    433     {
    434       "flag": "No limitations section",
    435       "detail": "The paper lacks any dedicated limitations or threats-to-validity section; scope boundaries (7B models only, specific task formats, English code) are not explicitly stated."
    436     },
    437     {
    438       "flag": "Alibaba affiliation evaluating Alibaba model",
    439       "detail": "Two authors are from Alibaba's Tongyi Lab and one of the four evaluated models is Qwen2.5-Coder-7B (an Alibaba product); no competing interest statement is provided."
    440     },
    441     {
    442       "flag": "HumanEval contamination unaddressed",
    443       "detail": "HumanEval is used as the sole proxy for model utility but is a well-known benchmark likely present in all evaluated models' training data; potential ceiling effects or contamination are not discussed."
    444     }
    445   ],
    446   "cited_papers": [
    447     {
    448       "title": "Large Language Model Unlearning",
    449       "relevance": "Introduces the Gradient Ascent baseline and frames the LLM unlearning problem; primary foil for PROD."
    450     },
    451     {
    452       "title": "Negative Preference Optimization: From Catastrophic Collapse to Effective Unlearning",
    453       "relevance": "NPO baseline; DPO-derived unlearning method for LLMs."
    454     },
    455     {
    456       "title": "LLM Unlearning via Loss Adjustment with Only Forget Data (FLAT)",
    457       "relevance": "Most recent baseline; f-divergence based unlearning with template responses."
    458     },
    459     {
    460       "title": "TOFU: A Task of Fictitious Unlearning for LLMs",
    461       "relevance": "Standard unlearning benchmark for natural language; provides evaluation context."
    462     },
    463     {
    464       "title": "Rethinking machine unlearning for large language models",
    465       "relevance": "Survey/position paper on LLM unlearning landscape; provides problem framing."
    466     },
    467     {
    468       "title": "Safety Alignment Should be Made More Than Just a Few Tokens Deep",
    469       "relevance": "Source of prefix injection attack methodology used for adversarial robustness evaluation."
    470     },
    471     {
    472       "title": "VersiCode: Towards version-controllable code generation",
    473       "relevance": "Dataset used for deprecated API unlearning task; benchmark on version-aware code generation."
    474     },
    475     {
    476       "title": "Purple Llama CyberSecEval: A Secure Coding Benchmark for Language Models",
    477       "relevance": "Dataset used for insecure code unlearning task; 1,916 snippets with 50 CWE vulnerabilities."
    478     }
    479   ],
    480   "engagement_factors": {
    481     "practical_relevance": {
    482       "score": 3,
    483       "justification": "Directly addresses real-world pain points (copyright infringement, security vulnerabilities, deprecated APIs) in deployed code LLMs, with released code enabling adoption."
    484     },
    485     "surprise_contrarian": {
    486       "score": 2,
    487       "justification": "The finding that state-of-the-art NLP unlearning methods catastrophically break code generation capability is genuinely surprising and challenges assumptions about method transferability."
    488     },
    489     "fear_safety": {
    490       "score": 2,
    491       "justification": "Addresses copyright infringement legal risk and security vulnerability propagation from LLMs, both active concerns for AI deployment."
    492     },
    493     "drama_conflict": {
    494       "score": 1,
    495       "justification": "Some tension with existing methods (showing they all fail), but no major community controversy; technical disagreement rather than ideological conflict."
    496     },
    497     "demo_ability": {
    498       "score": 2,
    499       "justification": "Code is released on GitHub and all datasets are publicly available, enabling practitioners to reproduce results on the three benchmark tasks."
    500     },
    501     "brand_recognition": {
    502       "score": 1,
    503       "justification": "Peking University and Alibaba Tongyi Lab are respectable institutions but not the most prominent names in the LLM unlearning community (DeepMind, OpenAI, CMU, etc.)."
    504     }
    505   },
    506   "hn_data": {
    507     "threads": [
    508       {
    509         "hn_id": "45588752",
    510         "title": "Evaluating Argon2 adoption and effectiveness in real-world software",
    511         "points": 32,
    512         "comments": 32,
    513         "url": "https://news.ycombinator.com/item?id=45588752",
    514         "created_at": "2025-10-15T06:29:18Z"
    515       },
    516       {
    517         "hn_id": "40795525",
    518         "title": "Indications of superconductivities in blend of variant apatite and covellite",
    519         "points": 26,
    520         "comments": 20,
    521         "url": "https://news.ycombinator.com/item?id=40795525",
    522         "created_at": "2024-06-26T01:20:27Z"
    523       },
    524       {
    525         "hn_id": "43736950",
    526         "title": "Show HN: LettuceDetect – Lightweight hallucination detector for RAG pipelines",
    527         "points": 10,
    528         "comments": 1,
    529         "url": "https://news.ycombinator.com/item?id=43736950",
    530         "created_at": "2025-04-19T15:12:02Z"
    531       },
    532       {
    533         "hn_id": "45147728",
    534         "title": "Contemplative Artificial Intelligence",
    535         "points": 3,
    536         "comments": 2,
    537         "url": "https://news.ycombinator.com/item?id=45147728",
    538         "created_at": "2025-09-06T09:02:21Z"
    539       },
    540       {
    541         "hn_id": "43740184",
    542         "title": "The Cambridge Report on Database Research",
    543         "points": 3,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=43740184",
    546         "created_at": "2025-04-19T23:09:29Z"
    547       },
    548       {
    549         "hn_id": "36552652",
    550         "title": "GenAI for Programming Education: Benchmarking ChatGPT, GPT-4, and Human Tutors",
    551         "points": 2,
    552         "comments": 1,
    553         "url": "https://news.ycombinator.com/item?id=36552652",
    554         "created_at": "2023-07-01T17:36:51Z"
    555       },
    556       {
    557         "hn_id": "42892730",
    558         "title": "Using Code Generation to Solve Open Instances of Combinatorial Design Problems",
    559         "points": 2,
    560         "comments": 0,
    561         "url": "https://news.ycombinator.com/item?id=42892730",
    562         "created_at": "2025-01-31T22:03:56Z"
    563       },
    564       {
    565         "hn_id": "41471269",
    566         "title": "Unifying Multimodal Retrieval via Document Screenshot Embedding",
    567         "points": 2,
    568         "comments": 0,
    569         "url": "https://news.ycombinator.com/item?id=41471269",
    570         "created_at": "2024-09-07T03:26:45Z"
    571       },
    572       {
    573         "hn_id": "45349444",
    574         "title": "Seeing Is Deceiving:Mirror-Based Lidar Spoofing for Autonomous Vehicle Deception",
    575         "points": 1,
    576         "comments": 0,
    577         "url": "https://news.ycombinator.com/item?id=45349444",
    578         "created_at": "2025-09-23T16:39:48Z"
    579       },
    580       {
    581         "hn_id": "44096948",
    582         "title": "Reasoning Model Is Stubborn: Instruction Overriding in Reasoning Models",
    583         "points": 1,
    584         "comments": 0,
    585         "url": "https://news.ycombinator.com/item?id=44096948",
    586         "created_at": "2025-05-26T12:44:46Z"
    587       }
    588     ],
    589     "top_points": 32,
    590     "total_points": 82,
    591     "total_comments": 56
    592   }
    593 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs