scan-v4.json (29624B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Agentic Refactoring: An Empirical Study of AI Coding Agents", 6 "authors": [ 7 "Kosei Horikawa", 8 "Hao Li", 9 "Yutaro Kashiwa", 10 "Bram Adams", 11 "Hajimu Iida", 12 "Ahmed E. Hassan" 13 ], 14 "year": 2025, 15 "venue": "arXiv", 16 "arxiv_id": "2511.04824", 17 "doi": null 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "Abstract claims about 26.1% refactoring rate, dominance of low-level edits, maintainability/readability motivation, and small but significant metric improvements are all supported by results in Sections 4.1-4.4.", 25 "source": "opus" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper is careful to use observational language ('suggests', 'indicates') and explicitly frames findings as descriptive rather than causal. The construct validity section (7.2) acknowledges the difficulty of isolating agent vs. human contributions.", 31 "source": "opus" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": true, 36 "justification": "External validity section (7.3) explicitly states limitations to OSS projects, Java language only, and the specific agents in the AIDev dataset. Caution against generalizing to closed-source or other languages is stated.", 37 "source": "opus" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": true, 42 "justification": "Section 7.2 discusses that commits labeled 'agentic' may include human modifications. Section 7.1 discusses tool limitations (RefactoringMiner false positives/negatives). The paper considers whether agents are doing 'code churn' vs. genuine improvement (Section 5.1).", 43 "source": "opus" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper measures specific code metrics (LOC, WMC, Cyclomatic Complexity, LCOM, smell counts) and frames findings at the same granularity: 'small but statistically significant improvements in structural metrics.' It explicitly acknowledges that these metrics are proxies: Section 7.2 discusses that RefactoringMiner/DesigniteJava 'may not fully capture all aspects' and Section 5 discusses whether metric improvements translate to actual quality gains.", 49 "source": "opus" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": true, 56 "justification": "Dedicated Section 7 'Threats to Validity' covering internal, construct, and external validity.", 57 "source": "opus" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": true, 62 "justification": "Specific threats discussed: RefactoringMiner/DesigniteJava tool accuracy (7.1), GPT-4.1-mini misclassification risk mitigated by κ = 0.77 (7.1), difficulty isolating human vs. agent contributions in commits (7.2), limitation to Java and OSS (7.3).", 63 "source": "opus" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "Section 7.3 explicitly states scope is limited to OSS projects, Java files only, and the five agents in the AIDev dataset. Section 7.2 notes the study cannot determine exact human intervention levels.", 69 "source": "opus" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": true, 76 "justification": "Acknowledgments section lists JSPS KAKENHI grants, JST PRESTO, ASPIRE, AIP Accelerated Program, and NSERC support.", 77 "source": "opus" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "Author affiliations clearly listed: NAIST (Japan) and Queen's University (Canada). No evaluated product affiliations.", 83 "source": "opus" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": true, 87 "answer": true, 88 "justification": "Funders are government research agencies (JSPS, JST, NSERC) with no financial stake in the study's outcomes regarding any specific coding agent.", 89 "source": "opus" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests or financial interests statement is present in the paper.", 95 "source": "opus" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "'Agentic coding tools' are defined and distinguished from prompt-based LLM workflows in Section 1; 'agentic refactoring commit' is formally defined in Section 3.2.4 with specific detection criteria; refactoring abstraction levels (high/medium/low) are precisely defined in Section 4.2.2.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper explicitly claims to provide 'the first large-scale empirical baseline of agentic refactoring' and states four specific RQs (prevalence, types, purposes, impact) that define the intended contribution.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 6 contains five subsections engaging with refactoring foundations, large-scale empirical studies, automated refactoring, AI-assisted development, and agentic SE, explicitly comparing findings against Kim et al. 2014, Murphy-Hill et al. 2012, Horikawa et al. 2025, and Watanabe et al. 2025.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": true, 125 "justification": "Replication package provided at https://github.com/Mont9165/Agent_Refactoring_Analysis (Section 1, footnote 4).", 126 "source": "opus" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "The study builds on the publicly available AIDev dataset [28] and provides its own replication package with derived data.", 132 "source": "opus" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned. Tools are named (RefactoringMiner 3.0.11, DesigniteJava, GPT-4.1-mini) but no environment setup details are provided.", 138 "source": "opus" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "No step-by-step reproduction instructions are described in the paper. A replication package is linked but no README or reproduction steps are mentioned in the paper itself.", 144 "source": "opus" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "Results report point estimates (percentages, medians, effect sizes) but no confidence intervals or error bars are provided.", 152 "source": "opus" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": true, 157 "justification": "Mann-Whitney U test (RQ1), Wilcoxon signed-rank test with Benjamini-Hochberg FDR correction (RQ4), and Kruskal-Wallis tests are used throughout (Sections 4.1, 4.4).", 158 "source": "opus" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Cliff's delta reported for RQ1 (d = 0.838, large), rank-biserial effect size for RQ4, and Cohen's d for smell analysis (d = -0.027, -0.026). Median Δ values provide concrete magnitude context (e.g., Class LOC Δ = -15.25).", 164 "source": "opus" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "No justification for why the dataset size is sufficient or power analysis. The sample is large (14,998 commits) but no formal justification is given.", 170 "source": "opus" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "Median values are reported but no standard deviations, IQR, or other numeric spread measures are stated in tables. Box plots are shown (Figures 3, 5) but no numeric spread measures accompany the main results.", 176 "source": "opus" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "Human refactoring patterns from prior work [22, 26] are used as baselines for comparison in RQ2 and RQ3.", 184 "source": "opus" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "Human refactoring baseline from Horikawa et al. [22] (2025) is contemporary. Kim et al. [26] (2014) is older but justified as the foundational source for refactoring motivation categories.", 190 "source": "opus" 191 }, 192 "ablation_study": { 193 "applies": false, 194 "answer": false, 195 "justification": "This is a mining/observational study, not a system with components to ablate.", 196 "source": "opus" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "Multiple code quality metrics used: LOC, WMC, Fan-In, Fan-Out, Cyclomatic Complexity, Depth of Inheritance Tree, LCOM, plus design and implementation smell counts (Section 4.4).", 202 "source": "opus" 203 }, 204 "human_evaluation": { 205 "applies": true, 206 "answer": true, 207 "justification": "Two human annotators independently labeled a stratified sample of refactoring purposes for RQ3, with Cohen's κ = 0.83 inter-rater agreement (Section 4.3.2, Table 6).", 208 "source": "opus" 209 }, 210 "held_out_test_set": { 211 "applies": false, 212 "answer": false, 213 "justification": "Not a prediction/classification study requiring train/test splits.", 214 "source": "opus" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "Results broken down by refactoring abstraction level (high/medium/low), by refactoring type (Table 5), by purpose category (Figure 4), and by metric (Table 7).", 220 "source": "opus" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Finding #8 discusses that agentic refactoring fails to reduce smell counts. Finding #12 discusses that some refactoring types show no metric improvement. Section 5 discusses limitations of agents for high-level refactoring.", 226 "source": "opus" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "Key negative results reported: agents fail to reduce design/implementation smells (Finding #8), low-level edits may slightly increase cyclomatic complexity (Finding #10), and agents underperform humans in high-level refactoring.", 232 "source": "opus" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": true, 239 "justification": "GPT-4.1-mini is specified for classification tasks (Sections 3.2.2, 4.3.2). RefactoringMiner 3.0.11 is specified with exact version (footnote 8).", 240 "source": "opus" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": false, 245 "justification": "GPT-4.1-mini is used for project classification and refactoring purpose classification, but the actual prompts used are not provided — only the categories are described.", 246 "source": "opus" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": false, 251 "justification": "No temperature, top-p, or other LLM hyperparameters are reported for the GPT-4.1-mini classification tasks.", 252 "source": "opus" 253 }, 254 "scaffolding_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "The study mines existing agent outputs rather than building an agentic system. The agents studied (Codex, Cursor, etc.) are third-party tools evaluated as black boxes.", 258 "source": "opus" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Multi-stage filtering pipeline is thoroughly documented with counts at each stage: 1.3M commits → Java filter → project filtering (automated + manual) → 1,613 repos, 14,998 commits → RefactoringMiner → 5,789 refactoring commits → SAR patterns → 3,907 agentic refactoring commits (Sections 3.1-3.2).", 264 "source": "opus" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": true, 271 "justification": "Replication package at https://github.com/Mont9165/Agent_Refactoring_Analysis is provided, and the study builds on the publicly available AIDev dataset.", 272 "source": "opus" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": true, 277 "justification": "Section 3 thoroughly describes data mining from AIDev dataset, GitHub REST API commit collection, and multi-stage filtering. Source dataset, API endpoints, and filtering criteria are all specified.", 278 "source": "opus" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants recruited. The study mines public GitHub repositories from an existing dataset.", 284 "source": "opus" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "Full pipeline documented in Section 3 and Figure 2: AIDev dataset → GitHub API mining → Java file filtering → project classification (GPT-4.1-mini + manual review) → fork removal → RefactoringMiner → SAR pattern matching, with counts at each stage.", 290 "source": "opus" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": false, 296 "answer": false, 297 "justification": "This is a mining study analyzing existing agent-generated commits. It does not evaluate a pre-trained model's capability on any benchmark.", 298 "source": "opus" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": false, 302 "answer": false, 303 "justification": "Not a benchmark evaluation study. The study mines and analyzes existing code contributions.", 304 "source": "opus" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": false, 308 "answer": false, 309 "justification": "Not a benchmark evaluation study.", 310 "source": "opus" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants. The study mines public repositories; human annotators validated LLM classifications but are not study subjects.", 318 "source": "opus" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants in the study.", 324 "source": "opus" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants. Annotators' experience levels are noted (7 and 17 years) but they are not study subjects.", 330 "source": "opus" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants.", 336 "source": "opus" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants.", 342 "source": "opus" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants.", 348 "source": "opus" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants.", 354 "source": "opus" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": false, 361 "justification": "The study uses GPT-4.1-mini for classification of thousands of commits and projects but does not report the API cost or token consumption.", 362 "source": "opus" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": false, 367 "justification": "No mention of total compute budget for running RefactoringMiner, DesigniteJava, or GPT-4.1-mini across the dataset.", 368 "source": "opus" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "Refactoring appears in 26.1% of agentic Java commits (3,907 of 14,998), making it a common and intentional activity in agentic software development.", 376 "evidence": "Table 3 shows 3,907 agentic refactoring commits out of 14,998 total; Mann-Whitney U test confirms significantly more instances when refactoring intent is explicit (p≤0.001, Cliff's d=0.838).", 377 "supported": "strong" 378 }, 379 { 380 "claim": "Agentic refactoring is dominated by low-level edits (35.8%) more than human refactoring (24.4%), with fewer high-level structural changes (43.0% vs 54.9% for humans).", 381 "evidence": "Table 4 compares abstraction level distributions between agents and humans (Horikawa et al. 2025 baseline), with top-3 per level in Table 5 showing agents over-index on Change Variable Type (11.8%), Rename Parameter (10.4%), Rename Variable (8.5%).", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Agentic refactoring is overwhelmingly motivated by maintainability (52.5%) and readability (28.1%), contrasting with human refactoring where these are less dominant.", 386 "evidence": "Figure 4 shows purpose distribution with GPT-4.1-mini classification validated at κ=0.77 against human labels; human comparison from Kim et al. 2014 shows maintainability only 11.7% for humans.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Agentic refactoring yields statistically significant but small structural improvements, particularly for medium-level changes (Class LOC median Δ = -15.25, WMC median Δ = -2.07).", 391 "evidence": "Table 7 reports Wilcoxon signed-rank tests with FDR correction showing significance, but rank-biserial effect sizes are described as negligible-to-small; medium-level refactorings show the largest improvements.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "Agentic refactoring fails to consistently reduce design and implementation smell counts (median Δ = 0.00 for both).", 396 "evidence": "Figure 5 shows near-complete overlap in before/after smell distributions; median Design Smell Δ = 0.00, median Implementation Smell Δ = 0.00 despite statistically significant Wilcoxon test (negligible effect: Cohen's d = -0.027).", 397 "supported": "strong" 398 }, 399 { 400 "claim": "OpenAI Codex dominates agentic refactoring at 89.3% of commits, making results primarily representative of Codex behavior.", 401 "evidence": "Table 2 shows Codex accounts for 13,389/14,998 commits (89.3%) and 11,557/12,256 PRs (94.3%); Claude Code represents only 86 commits (0.6%).", 402 "supported": "strong" 403 } 404 ], 405 "methodology_tags": [ 406 "observational" 407 ], 408 "key_findings": "Agentic coding agents refactor code in 26.1% of commits, but their refactoring skews heavily toward low-level consistency edits (renaming, type changes at 35.8% vs. 24.4% for humans) rather than high-level architectural restructuring (43.0% vs. 54.9%). Motivations are overwhelmingly internal quality-focused (maintainability 52.5%, readability 28.1%), but quantitative outcomes are disappointing: while medium-level refactorings show statistically significant structural improvements (Class LOC median Δ = -15.25), design and implementation smell counts show virtually no change (median Δ = 0.00 for both), indicating agents function as incremental cleanup tools rather than architectural redesigners. The dataset is 89.3% OpenAI Codex commits, substantially limiting generalizability to other agents.", 409 "red_flags": [ 410 { 411 "flag": "Codex monoculture", 412 "detail": "OpenAI Codex accounts for 89.3% of commits and 94.3% of PRs. Claude Code has only 86 commits (0.6%). The paper presents findings as 'agentic refactoring' in general but is effectively a Codex-specific study." 413 }, 414 { 415 "flag": "Dated human baseline", 416 "detail": "The primary purpose comparison (RQ3) uses Kim et al. 2014 — an 11-year-old survey of Microsoft Windows developers — as the human refactoring baseline. The context difference (industrial closed-source 2014 vs. OSS AI-assisted 2024-25) makes the comparison questionable." 417 }, 418 { 419 "flag": "GPT-4.1-mini prompts unreported", 420 "detail": "GPT-4.1-mini is used for both project classification (production_grade vs. toy) and refactoring purpose classification across thousands of commits, but no prompts are provided in the paper or replication package description, making independent validation impossible." 421 }, 422 { 423 "flag": "No environment specification", 424 "detail": "No Dockerfile, requirements.txt, or API version snapshot is provided for the full pipeline; partial reproduction relies on knowing the correct DesigniteJava and GPT-4.1-mini API configuration." 425 }, 426 { 427 "flag": "Commit-level conflation", 428 "detail": "Section 7.2 acknowledges that 'agentic commits' may contain human modifications, but the construct validity threat is underexplored — the extent of human-AI co-authorship within individual commits cannot be determined from commit metadata alone." 429 } 430 ], 431 "cited_papers": [ 432 { 433 "title": "The Rise of AI Teammates in Software Engineering (SE) 3.0: How Autonomous Coding Agents Are Reshaping Software Engineering (AIDev dataset)", 434 "relevance": "Source dataset for the study — 932K+ PRs from 5 coding agents across 61K+ repositories; foundational for agentic SE research" 435 }, 436 { 437 "title": "RefactoringMiner 2.0", 438 "relevance": "Core detection tool used to identify 103 refactoring types with 99.5% F-score; enables large-scale automated refactoring mining" 439 }, 440 { 441 "title": "An Empirical Study of Refactoring Challenges and Benefits at Microsoft (Kim et al. 2014)", 442 "relevance": "Human refactoring purpose baseline for RQ3; establishes the 9-category motivation taxonomy adapted by this paper" 443 }, 444 { 445 "title": "How We Refactor, and How We Know It (Murphy-Hill et al. 2012)", 446 "relevance": "Provides the refactoring abstraction level framework (high/medium/low) extended by this paper to classify 103 RefactoringMiner types" 447 }, 448 { 449 "title": "Can refactoring be self-affirmed? (AlOmar et al. 2019)", 450 "relevance": "Source of the 87-pattern Self-Affirmed Refactoring (SAR) keyword list used to identify agentic refactoring commits" 451 }, 452 { 453 "title": "Understanding the impact of refactoring on smells: a longitudinal study of 23 software projects (Cedrim et al. 2017)", 454 "relevance": "Prior finding that <10% of refactorings remove code smells contextualizes this paper's Finding #8 on smell count stability" 455 }, 456 { 457 "title": "On the Use of Agentic Coding: An Empirical Study of Pull Requests on GitHub (Watanabe et al. 2025)", 458 "relevance": "Most directly related work — finds 45.1% of agentic PRs need post-review fixes and agents refactor more than humans; companion study" 459 }, 460 { 461 "title": "Agentic Software Engineering: Foundational Pillars and a Research Roadmap (Hassan et al. 2025)", 462 "relevance": "Defines the agentic software engineering paradigm within which this empirical study is situated" 463 }, 464 { 465 "title": "Multi-faceted Code Smell Detection at Scale using DesigniteJava 2.0 (Sharma 2024)", 466 "relevance": "Tool used to detect 27 design/implementation smells and extract OO metrics for the before-after quality analysis" 467 }, 468 { 469 "title": "How Does Test Code Differ From Production Code in Terms of Refactoring? An Empirical Study (Horikawa et al. 2025)", 470 "relevance": "Contemporary human refactoring type distribution baseline used for RQ2 comparison" 471 } 472 ], 473 "engagement_factors": { 474 "practical_relevance": { 475 "score": 2, 476 "justification": "Directly informs developers on what to delegate to AI agents (low-level cleanup) vs. handle themselves (architectural refactoring)." 477 }, 478 "surprise_contrarian": { 479 "score": 1, 480 "justification": "The finding that agents fail to reduce code smells despite refactoring is mildly surprising, but the dominance of low-level edits is largely expected." 481 }, 482 "fear_safety": { 483 "score": 0, 484 "justification": "No safety, security, or risk angle is present in this study." 485 }, 486 "drama_conflict": { 487 "score": 1, 488 "justification": "Mildly questions the value proposition of AI coding agents by showing they produce negligible quality improvements and mostly do cosmetic cleanup." 489 }, 490 "demo_ability": { 491 "score": 1, 492 "justification": "Replication package exists on GitHub but requires RefactoringMiner, DesigniteJava, and significant setup to reproduce." 493 }, 494 "brand_recognition": { 495 "score": 2, 496 "justification": "Study directly analyzes OpenAI Codex, Claude Code, Cursor, and Devin — well-known products in the developer tools space." 497 } 498 }, 499 "hn_data": { 500 "threads": [ 501 { 502 "hn_id": "33795122", 503 "title": "No Privacy in the Electronics Repair Industry", 504 "points": 173, 505 "comments": 131, 506 "url": "https://news.ycombinator.com/item?id=33795122", 507 "created_at": "2022-11-30T00:02:16Z" 508 }, 509 { 510 "hn_id": "46902855", 511 "title": "Psychometric Jailbreaks Reveal Internal Conflict in Frontier Models", 512 "points": 68, 513 "comments": 60, 514 "url": "https://news.ycombinator.com/item?id=46902855", 515 "created_at": "2026-02-05T18:21:53Z" 516 }, 517 { 518 "hn_id": "45823358", 519 "title": "Kosmos: An AI Scientist for Autonomous Discovery", 520 "points": 60, 521 "comments": 20, 522 "url": "https://news.ycombinator.com/item?id=45823358", 523 "created_at": "2025-11-05T14:43:26Z" 524 }, 525 { 526 "hn_id": "10581137", 527 "title": "Neural Programmer: Inducing Latent Programs with Gradient Descent [pdf]", 528 "points": 59, 529 "comments": 21, 530 "url": "https://news.ycombinator.com/item?id=10581137", 531 "created_at": "2015-11-17T14:15:58Z" 532 }, 533 { 534 "hn_id": "46207995", 535 "title": "Psychometric Jailbreaks Reveal Internal Conflict in Frontier Models", 536 "points": 4, 537 "comments": 0, 538 "url": "https://news.ycombinator.com/item?id=46207995", 539 "created_at": "2025-12-09T17:46:24Z" 540 }, 541 { 542 "hn_id": "46358753", 543 "title": "Psychometric Jailbreaks Reveal Internal Conflict in Frontier Models", 544 "points": 2, 545 "comments": 0, 546 "url": "https://news.ycombinator.com/item?id=46358753", 547 "created_at": "2025-12-22T20:38:00Z" 548 }, 549 { 550 "hn_id": "42258010", 551 "title": "Gradient Boosting Trees and LLMs for Tabular Data Few-Shot Learning", 552 "points": 2, 553 "comments": 0, 554 "url": "https://news.ycombinator.com/item?id=42258010", 555 "created_at": "2024-11-27T17:46:47Z" 556 }, 557 { 558 "hn_id": "42150576", 559 "title": "WiFlexFormer: Efficient WiFi-Based Person-Centric Sensing", 560 "points": 2, 561 "comments": 0, 562 "url": "https://news.ycombinator.com/item?id=42150576", 563 "created_at": "2024-11-15T20:27:07Z" 564 }, 565 { 566 "hn_id": "45873709", 567 "title": "The Drain of Scientific Publishing", 568 "points": 1, 569 "comments": 0, 570 "url": "https://news.ycombinator.com/item?id=45873709", 571 "created_at": "2025-11-10T08:21:43Z" 572 }, 573 { 574 "hn_id": "46559629", 575 "title": "When AI Takes the Couch: Internal Conflict in Frontier Models", 576 "points": 1, 577 "comments": 0, 578 "url": "https://news.ycombinator.com/item?id=46559629", 579 "created_at": "2026-01-09T21:29:20Z" 580 } 581 ], 582 "top_points": 173, 583 "total_points": 372, 584 "total_comments": 232 585 } 586 }