scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28850B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Agentic Refactoring: An Empirical Study of AI Coding Agents",
      6     "authors": [
      7       "Kosei Horikawa",
      8       "Hao Li",
      9       "Yutaro Kashiwa",
     10       "Bram Adams",
     11       "Hajimu Iida",
     12       "Ahmed E. Hassan"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv",
     16     "arxiv_id": "2511.04824",
     17     "doi": "10.48550/arXiv.2511.04824"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All four major abstract claims are directly supported: 26.1% refactoring rate (Table 3), low-level dominance (Table 4-5), maintainability/readability motivation (Figure 4), and small structural improvements (Table 7).",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper claims agentic refactoring 'yields improvements' in structural metrics using before-after comparison, but this observational design cannot establish causality — the refactoring commit itself may co-occur with other changes that affect metrics.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Section 7.3 explicitly bounds generalization: results are limited to OSS Java projects from the AIDev dataset, and 'caution should be exercised when generalizing our results to other contexts' including industrial projects and other languages.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The finding that agents perform more low-level refactoring could reflect task assignment patterns (developers assign low-level cleanup to agents), repository characteristics, or AIDev dataset composition (89.3% Codex), but these alternatives are not systematically explored.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Finding #12 explicitly acknowledges that structural metrics do not capture readability or naming benefits: 'their main benefits (e.g., readability, naming consistency, API clarity) are not captured by the selected design-level indicators.'",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 7 'Threats to Validity' has dedicated subsections for Internal Validity (7.1), Construct Validity (7.2), and External Validity (7.3).",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Threats are specific: RefactoringMiner false positives/negatives, GPT-4.1-mini misclassification risk mitigated by kappa=0.77 validation, ambiguity of 'agentic commit' definition with unknown human intervention extent.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Paper explicitly states results are bounded to Java OSS projects and that 'development practices, coding standards, and types of refactoring in industrial, closed-source projects may differ significantly.'",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Acknowledgments lists specific grants: JSPS KAKENHI (JP24K02921, JP25K21359), JST PRESTO (JPMJPR22P3), ASPIRE (JPMJAP2415), AIP Accelerated Program (JPMJCR25U7), and NSERC.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All six authors list their affiliations: Nara Institute of Science and Technology (Japan) and Queen's University (Canada), with contact emails provided.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "All funders (JSPS, JST, NSERC) are government/academic agencies with no financial interest in the commercial coding agents (Codex, Claude Code, Cursor, Devin) evaluated in the study.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement appears anywhere in the paper; only funding acknowledgment is provided.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Refactoring is defined via Opdyke; 'agentic refactoring commit' is operationally defined as RefactoringMiner detection plus SAR keyword in commit message; the three abstraction levels (high/medium/low) are precisely defined with examples.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper explicitly frames its contribution as 'the first large-scale empirical baseline of agentic refactoring' answering four RQs on prevalence, types, purposes, and impact.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 6 contains substantive engagement with prior work across four areas, directly comparing agent findings to human refactoring data from Kim et al. [26] and Horikawa et al. [22], and positioning against automated refactoring literature.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "A replication package is provided at https://github.com/Mont9165/Agent_Refactoring_Analysis, referenced multiple times including for the refactoring level classification mapping.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The study uses the publicly available AIDev dataset [28], and the authors' derived analysis data is available in the replication package on GitHub.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "RefactoringMiner 3.0.11 and DesigniteJava versions are mentioned, and GPT-4.1-mini is named, but no requirements file, Dockerfile, or dependency specification is provided.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "A replication package exists but the paper provides no step-by-step instructions for reproducing the analysis pipeline from raw data to findings.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "The paper reports effect sizes (Cliff's delta, rank-biserial) and p-values but does not provide confidence intervals for key proportions (e.g., the 26.1% refactoring rate) or median deltas.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Mann-Whitney U test is used for RQ1, Wilcoxon signed-rank tests with Benjamini-Hochberg FDR adjustment for RQ4, and Kruskal-Wallis tests for cross-group comparisons.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Cliff's delta with explicit thresholds (negligible/small/medium/large) for RQ1, rank-biserial effect sizes for RQ4, Cohen's kappa for inter-rater reliability, and Cohen's d for smell changes.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The 14,998 commit sample size results from dataset filtering steps, not from a power analysis or a priori justification for statistical adequacy.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Table 7 reports only median delta values without IQR or standard deviations; Figure 3 shows distributions visually but numeric spread is not reported for key findings.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Human refactoring patterns from Horikawa et al. [22] (abstraction levels) and Kim et al. [26] (purposes) serve as explicit baselines for comparison throughout RQ2 and RQ3.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "The primary purpose baseline (Kim et al. 2014) is over a decade old and comes from Microsoft developers, not from open-source projects; the abstraction-level baseline (Horikawa et al. 2025) is more recent.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": false,
    194           "answer": false,
    195           "justification": "This is an observational mining study with no system components to ablate.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "RQ4 uses multiple metrics: Class LOC, WMC, Fan-In, Fan-Out, DIT, Number of Methods (class-level) and Parameter Count, Cyclomatic Complexity, Method LOC (method-level), plus 27 design/implementation smell counts.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Two human annotators with seven years of programming experience independently labeled a stratified sample of commits for refactoring purpose, achieving Cohen's kappa=0.83 inter-rater agreement.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": false,
    212           "answer": false,
    213           "justification": "This is an observational mining study, not a prediction task requiring a held-out test set.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down by refactoring abstraction level (high/medium/low) in Table 7, by purpose category in Figure 4, and by AI agent type in Table 2.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Findings #8 and #12 explicitly discuss where agents fail: negligible smell reduction (median Δ=0.00), high-frequency types like identifier renames showing 'negligible before-and-after change', and Move And Inline Method sometimes increasing complexity.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Finding #8 is an explicit negative result: design and implementation smell counts show no practical improvement (median Δ=0.00) despite statistically significant differences, with negligible effect sizes (Cohen's d=-0.027).",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "GPT-4.1-mini is specified for purpose classification; RefactoringMiner 3.0.11 is specified; DesigniteJava 2.0 is referenced via citation [49].",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "GPT-4.1-mini is used to classify refactoring purposes and classify repositories, but the actual prompts given to the model are not provided in the paper or referenced in the replication package.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "No temperature, top-p, or other inference hyperparameters are reported for GPT-4.1-mini usage in either repository classification or refactoring purpose classification.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "The paper studies outputs of third-party agentic tools (Codex, Devin, Cursor, Claude Code) from the AIDev dataset; the authors do not deploy or control any scaffolding themselves.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section 3 details the full multi-stage pipeline: Java file filtering, toy project classification via GPT-4.1-mini with manual verification, fork removal, RefactoringMiner application, and SAR keyword identification with the complete 87-pattern list in Table 1.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "The replication package at GitHub includes the derived analysis data, and the source AIDev dataset [28] is a published public dataset.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 3 describes the full collection process: starting from AIDev's 932,791 PRs across 61,000+ repos, using GitHub REST API to collect 1,311,057 commits, then applying multi-stage filtering down to 14,998 commits.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participant recruitment — this is a mining study of public GitHub commit data.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "Figure 2 provides a full visual overview of the pipeline from AIDev mining through filtering to each RQ analysis, with detailed step-by-step descriptions in Sections 3.2.1–3.2.5.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": false,
    296           "answer": false,
    297           "justification": "The paper does not evaluate LLM capabilities on benchmarks — GPT-4.1-mini is used as a classifier for human-labeled categories, not tested on held-out capability benchmarks.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "Not applicable — the paper studies AI agent commit outputs in practice, not model benchmark performance.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "Not applicable — no benchmark evaluation of model capabilities.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants — mining study of GitHub commits. The two human annotators for validation are internal quality checks, not research participants.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participant research requiring IRB.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participant study.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participant study.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participant study.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participant study.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participant study.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "GPT-4.1-mini is used to classify 3,907+ commits and 1,613 repositories, but no inference cost or latency is reported.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No total computational budget is stated for running RefactoringMiner on 14,998 commits or DesigniteJava on the before/after states of all agentic refactoring commits.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "Refactoring is common in agentic software development, appearing in 26.1% of agentic Java commits (3,907 of 14,998).",
    376       "evidence": "Table 3 directly reports the counts: 3,907 agentic refactoring commits out of 14,998 total. Mann-Whitney U test confirms these commits contain significantly more refactoring instances (Cliff's delta=0.838, large effect).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Agentic refactoring leans more heavily on low-level edits (35.8%) than human refactoring does (24.4%), while agents perform fewer high-level structural changes (43.0% vs 54.9% for humans).",
    381       "evidence": "Table 4 shows the abstraction-level distribution comparison; Table 5 shows the top three types per level for agents vs. humans from Horikawa et al. [22].",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Agentic refactoring is overwhelmingly motivated by maintainability (52.5%) and readability (28.1%), together accounting for over 80% of cases.",
    386       "evidence": "Figure 4 shows the purpose distribution; GPT-4.1-mini classification validated with Cohen's kappa=0.77 against human labels on a stratified sample.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Agentic refactoring yields statistically significant but practically small structural improvements, most notably for medium-level changes (Class LOC median Δ=-15.25, WMC median Δ=-2.07).",
    391       "evidence": "Table 7 reports per-level median deltas with FDR-adjusted Wilcoxon signed-rank significance; effect sizes described as negligible-to-small.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Agentic refactoring fails to consistently reduce design and implementation smell counts despite explicit refactoring intent.",
    396       "evidence": "Figure 5 shows nearly identical before/after smell distributions; median Δ=0.00 for both design and implementation smells; Cohen's d=-0.027 and -0.026 (negligible).",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "OpenAI Codex dominates the dataset at 89.3% of commits, making findings largely specific to one agent.",
    401       "evidence": "Table 2 reports agent distribution: Codex 13,389 commits (89.3%), Devin 860 (5.7%), Cursor 663 (4.4%), Claude Code 86 (0.6%).",
    402       "supported": "strong"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "observational"
    407   ],
    408   "key_findings": "This large-scale mining study of 15,451 agentic refactoring instances from 14,998 Java commits shows AI agents actively participate in refactoring (26.1% of commits), but their efforts are concentrated on low-level, consistency-oriented edits (renaming, type changes) at 35.8% vs. 24.4% for humans, driven overwhelmingly by maintainability (52.5%) and readability (28.1%) rather than design concerns. Structural metrics show small but statistically significant improvements for medium-level refactorings (Class LOC median Δ=-15.25), but design and implementation smell counts show negligible change (median Δ=0.00, Cohen's d<0.03), indicating agents serve as incremental cleanup partners rather than architectural restructurers. Results are heavily skewed by OpenAI Codex (89.3% of commits), limiting generalizability to agentic coding broadly.",
    409   "red_flags": [
    410     {
    411       "flag": "Single-agent dominance",
    412       "detail": "OpenAI Codex accounts for 89.3% of all commits and 94.3% of PRs in the dataset; Claude Code contributes only 0.6%. Findings presented as 'agentic' behavior are almost entirely Codex-specific and may not generalize."
    413     },
    414     {
    415       "flag": "Decade-old human baseline",
    416       "detail": "The primary comparison for refactoring purposes uses Kim et al. 2014 data from Microsoft developers, which may not reflect current open-source developer behavior. Cross-ecosystem and cross-decade comparison weakens the contrast claims."
    417     },
    418     {
    419       "flag": "Prompts not disclosed",
    420       "detail": "GPT-4.1-mini is used to classify both repository type and refactoring purpose for thousands of commits, but the actual prompts are not provided. This prevents independent validation of the classification approach."
    421     },
    422     {
    423       "flag": "No confidence intervals",
    424       "detail": "Key proportions (26.1% refactoring rate, 52.5% maintainability motivation) are reported as point estimates without confidence intervals, leaving the uncertainty around the true population rates unquantified."
    425     },
    426     {
    427       "flag": "Before-after causality conflation",
    428       "detail": "The 'impact on code quality' analysis compares metrics before and after refactoring commits, but these commits may contain mixed changes (tangled commits); Finding #1 acknowledges 53.9% of refactoring instances occur in non-refactoring commits."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "The Rise of AI Teammates in Software Engineering (SE) 3.0: How Autonomous Coding Agents Are Reshaping Software Engineering",
    434       "relevance": "Source of the AIDev dataset (932,791 PRs across 61,000+ repos) used as the primary data source for this study."
    435     },
    436     {
    437       "title": "RefactoringMiner 2.0",
    438       "relevance": "Core tool for detecting 103 refactoring types in Java commits, achieving 99.5% F-score; central to the methodology."
    439     },
    440     {
    441       "title": "An Empirical Study of Refactoring Challenges and Benefits at Microsoft",
    442       "relevance": "Provides the human refactoring purpose baseline used throughout RQ3 comparison; primary external comparison dataset."
    443     },
    444     {
    445       "title": "Understanding the impact of refactoring on smells: a longitudinal study of 23 software projects",
    446       "relevance": "Prior finding that <10% of refactorings remove smells and >30% introduce new ones; contextualizes the smell non-reduction finding."
    447     },
    448     {
    449       "title": "How We Refactor, and How We Know It",
    450       "relevance": "Establishes the three abstraction levels (high/medium/low) framework used to classify refactoring types in RQ2."
    451     },
    452     {
    453       "title": "On the Use of Agentic Coding: An Empirical Study of Pull Requests on GitHub",
    454       "relevance": "Directly related work on agentic PRs showing 45.1% required post-review fixes; provides context for the broader agentic coding landscape."
    455     },
    456     {
    457       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    458       "relevance": "Reports that 21.9% of developers avoid AI for refactoring due to correctness concerns; motivates studying agentic refactoring adoption."
    459     }
    460   ],
    461   "engagement_factors": {
    462     "practical_relevance": {
    463       "score": 3,
    464       "justification": "Directly actionable: tells developers what to delegate (low-level cleanup) vs. retain (architectural changes) when using coding agents like Codex, Claude Code, or Cursor."
    465     },
    466     "surprise_contrarian": {
    467       "score": 2,
    468       "justification": "The finding that agentic refactoring fails to reduce design smells despite explicit maintainability intent — and that structural improvements are negligible — challenges the narrative that AI agents improve code quality."
    469     },
    470     "fear_safety": {
    471       "score": 0,
    472       "justification": "No AI safety or risk concerns raised; the study evaluates code quality improvements, not safety-critical behaviors."
    473     },
    474     "drama_conflict": {
    475       "score": 1,
    476       "justification": "Mild conflict angle: study implicitly questions whether 86.9% PR merge rate reflects real quality improvement or developer over-trust in agent-generated refactorings."
    477     },
    478     "demo_ability": {
    479       "score": 2,
    480       "justification": "The AIDev dataset and replication package are publicly available; practitioners can immediately explore the data and verify findings."
    481     },
    482     "brand_recognition": {
    483       "score": 2,
    484       "justification": "Explicitly studies Claude Code, OpenAI Codex, Cursor, and Devin — all recognizable commercial products with substantial user bases."
    485     }
    486   },
    487   "hn_data": {
    488     "threads": [
    489       {
    490         "hn_id": "33795122",
    491         "title": "No Privacy in the Electronics Repair Industry",
    492         "points": 173,
    493         "comments": 131,
    494         "url": "https://news.ycombinator.com/item?id=33795122",
    495         "created_at": "2022-11-30T00:02:16Z"
    496       },
    497       {
    498         "hn_id": "46902855",
    499         "title": "Psychometric Jailbreaks Reveal Internal Conflict in Frontier Models",
    500         "points": 68,
    501         "comments": 60,
    502         "url": "https://news.ycombinator.com/item?id=46902855",
    503         "created_at": "2026-02-05T18:21:53Z"
    504       },
    505       {
    506         "hn_id": "45823358",
    507         "title": "Kosmos: An AI Scientist for Autonomous Discovery",
    508         "points": 60,
    509         "comments": 20,
    510         "url": "https://news.ycombinator.com/item?id=45823358",
    511         "created_at": "2025-11-05T14:43:26Z"
    512       },
    513       {
    514         "hn_id": "10581137",
    515         "title": "Neural Programmer: Inducing Latent Programs with Gradient Descent [pdf]",
    516         "points": 59,
    517         "comments": 21,
    518         "url": "https://news.ycombinator.com/item?id=10581137",
    519         "created_at": "2015-11-17T14:15:58Z"
    520       },
    521       {
    522         "hn_id": "46207995",
    523         "title": "Psychometric Jailbreaks Reveal Internal Conflict in Frontier Models",
    524         "points": 4,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=46207995",
    527         "created_at": "2025-12-09T17:46:24Z"
    528       },
    529       {
    530         "hn_id": "46358753",
    531         "title": "Psychometric Jailbreaks Reveal Internal Conflict in Frontier Models",
    532         "points": 2,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=46358753",
    535         "created_at": "2025-12-22T20:38:00Z"
    536       },
    537       {
    538         "hn_id": "42258010",
    539         "title": "Gradient Boosting Trees and LLMs for Tabular Data Few-Shot Learning",
    540         "points": 2,
    541         "comments": 0,
    542         "url": "https://news.ycombinator.com/item?id=42258010",
    543         "created_at": "2024-11-27T17:46:47Z"
    544       },
    545       {
    546         "hn_id": "42150576",
    547         "title": "WiFlexFormer: Efficient WiFi-Based Person-Centric Sensing",
    548         "points": 2,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=42150576",
    551         "created_at": "2024-11-15T20:27:07Z"
    552       },
    553       {
    554         "hn_id": "45873709",
    555         "title": "The Drain of Scientific Publishing",
    556         "points": 1,
    557         "comments": 0,
    558         "url": "https://news.ycombinator.com/item?id=45873709",
    559         "created_at": "2025-11-10T08:21:43Z"
    560       },
    561       {
    562         "hn_id": "46559629",
    563         "title": "When AI Takes the Couch: Internal Conflict in Frontier Models",
    564         "points": 1,
    565         "comments": 0,
    566         "url": "https://news.ycombinator.com/item?id=46559629",
    567         "created_at": "2026-01-09T21:29:20Z"
    568       }
    569     ],
    570     "top_points": 173,
    571     "total_points": 372,
    572     "total_comments": 232
    573   }
    574 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs