scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28130B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "How Safe Are AI-Generated Patches? A Large-scale Study on Security Risks in LLM and Agentic Automated Program Repair on SWE-bench",
      6     "authors": [
      7       "Amirali Sajadi",
      8       "Kostadin Damevski",
      9       "Preetha Chatterjee"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2507.02976",
     14     "doi": "XXXXXXX.XXXXXXX"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All major abstract claims are substantiated: 11x more vulnerabilities for Llama vs developers (135 vs 12), agentic frameworks also introduce vulnerabilities (OH=44, ACR=3, HC=2), and RQ3 identifies code/issue-level factors associated with vulnerable patches.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper uses an observational/associative design (Cliff's delta, Spearman's rho, chi-squared) but includes causal language such as 'Llama introduces' and 'increasing LLM autonomy can further amplify vulnerability risks'; no causal study design (e.g., randomization, counterfactual) is used to support these framings.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper explicitly scopes conclusions to the tested systems (one LLM, three frameworks) and SWE-bench's Python-only repositories; the threats section states 'we acknowledge the limitation in terms of data scale' and avoids cross-setting comparisons.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper does not discuss that oracle retrieval (giving Llama the exact files developers used) or the full-file vs. diff generation paradigm could systematically inflate or shift vulnerability patterns relative to developers; only one interpretation per finding is presented.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper explicitly distinguishes static analysis flags from confirmed vulnerabilities through a three-step pipeline (multi-tool aggregation, majority voting, manual inspection), and distinguishes 'new' from 'persisting' vulnerabilities at the file level.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 4 'Threats to Validity' is a dedicated section covering static analysis limitations, model/framework scope, dataset scale, and annotation reliability.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Threats are specific: false positives addressed with named tools and majority voting; agentic framework cost cited as '$0.70 per instance' making full train-set runs '$13,000+' infeasible; manual annotation limited to vulnerable instances with inter-rater kappa >0.9 reported.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states 'we therefore refrain from making strong statistical claims about the agentic systems' and 'avoid any claims of statistical superiority or inferiority between Llama and the frameworks' due to different evaluation splits.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding acknowledgment or grant information appears anywhere in the paper text; absence of any funding statement means this criterion is not met.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors list their institutional affiliations (Drexel University and Virginia Commonwealth University) on the title page.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "Funding is not disclosed, so independence of funder cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement, patent disclosure, or equity declaration appears in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "APR is defined as agents that 'identify, diagnose, and patch software bugs directly in real-world projects'; new vs. persisting vulnerabilities are explicitly defined; CWE categories are named; vulnerability detection pipeline is operationally defined.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper explicitly frames its contribution as 'the first large-scale security analysis of LLM-generated patches using 20,000+ GitHub issues' and states three concrete research questions with clear scope.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 6 'Related Work' explicitly distinguishes this study from prior adversarial-focused work (Przymus et al., Chen et al.) and prior snippet-based security analyses (Pearce et al., Khoury et al.), positioning the contribution as the first large-scale non-adversarial analysis.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Reference [3] is a live figshare replication package URL (https://figshare.com/s/174a976de48f28ae1482?file=57891562) described as containing 'data and research artifacts publicly accessible.'",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "SWE-bench is a public benchmark, and the replication package on figshare includes generated patches and vulnerability analysis results.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No Dockerfile, requirements.txt, or pinned tool versions are provided in the paper; specific versions of CodeQL, Semgrep, and Bandit used are not reported.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The methodology is described at a conceptual level; step-by-step reproduction instructions are not present in the paper and presumably reside only in the external replication package.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Results are reported with p-values and Cliff's delta effect sizes but no confidence intervals or error bars are provided for main vulnerability count results.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Chi-squared tests (code snippet presence), Spearman's rank correlation (project-level factors), and Mann-Whitney-based Cliff's delta (code-level factors) are used throughout RQ3.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Cliff's delta is systematically reported for all code-level comparisons in Table 3, alongside p-values.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "No power analysis is conducted; the very small vulnerable samples for ACR (n=3) and HC (n=2) are noted as limiting quantitative inference but not formally justified.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Tables report means (Vuln Mean vs All Mean) without standard deviations or interquartile ranges; distributions are shown in Figure 2 only qualitatively.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Developer-written patches from the same GitHub issues serve as a direct baseline throughout RQ1, with counts reported side-by-side in Table 1.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Developer patches are the ground-truth PRs from the exact same issues evaluated, making them a direct and appropriate baseline rather than a historical or weak comparison.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": false,
    191           "answer": false,
    192           "justification": "This is a comparative empirical study; ablation is not applicable as there is no single configurable component being tested.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Vulnerability counts, CWE type distributions, Cliff's delta effect sizes, chi-squared statistics, and Spearman's correlations are all used across the three research questions.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "The first author manually inspected all majority-vote flagged instances to remove false positives; two annotators independently labeled issue type, bug type, and information completeness with reported inter-rater agreement (kappa >0.9).",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "This is not a prediction task; the study measures vulnerability rates in generated code rather than training a classifier.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Table 2 provides CWE-type breakdowns per patch source; Table 5 provides bug-type breakdowns; Table 4 provides per-framework code-level metric comparisons.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Concrete failure examples are provided: Llama using eval() in Qiskit (CWE-95), Llama downgrading to MD5 for password hashing (CWE-327), OpenHands constructing raw SQL in Django (CWE-89).",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "RQ3 systematically reports that project-level factors (repository size, cyclomatic complexity, maintainability index, contributor count) show weak, non-significant correlations with vulnerability rates across all systems.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Llama 3.3 Instruct (70B), OpenHands+CodeAct v2.1 with Claude-3.5-Sonnet-20241022, and AutoCodeRover-v2.0 with Claude-3.5-Sonnet-20241022 are all specified with snapshot dates; HoneyComb lacks a version identifier, but this is a minor omission.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "Prompts are described as 'the prompt structure used in SWE-bench, with slight modifications' to produce complete files rather than diffs, but actual prompt text is not included in the paper.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "No temperature, top-p, max tokens, or other inference hyperparameters are reported for any of the models.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": false,
    254           "justification": "The paper describes OH's broad autonomy vs. ACR's constrained edits at a high level but defers to external papers for scaffold details; internal loop structures, tool calls, and iteration limits are not described.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The process for extracting pre/post-patch file snapshots, applying patches via the modified SWE-bench harness, and running three static analysis tools with majority voting is described in detail in Section 2.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The replication package at the figshare URL (reference [3]) is described as making data and research artifacts publicly accessible.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Developer patches are collected via the GitHub REST API from merged PRs; SWE-bench provides issue-PR pairs; the process for obtaining agentic patches from publicly released results is described.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No participant recruitment — the study uses SWE-bench as a standard public benchmark.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline from patch collection → pre/post-patch extraction → three-tool static analysis → majority voting → manual inspection is documented step-by-step in Sections 2.3–2.4.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Llama 3.3's training data cutoff is never stated; the paper does not address whether SWE-bench issues predate the model's training cutoff.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether Llama 3.3 was trained on SWE-bench issues or the corresponding GitHub repositories; this is a gap given the benchmark's public availability.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "SWE-bench issues are from public GitHub repositories predating Llama 3.3's release; potential contamination of training data with these issues is not addressed.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in the IRB sense; study uses automated benchmark data.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human subjects study; no IRB applicable.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "The paper cites ACR's published cost ($0.70/instance from their paper) to explain why full train-set agentic evaluation is infeasible, but does not report the actual inference cost of their own Llama patch generation.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Only a vague statement that Llama generation 'took well over a week' is given; no GPU hours, API costs, or hardware specifications are reported.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Llama 3.3 (70B) introduces 11x more new vulnerabilities than developers on the same SWE-bench issues (135 vs. 12 after manual validation).",
    373       "evidence": "Table 1 reports majority-vote counts of 185 (Llama) vs. 17 (developers), reduced to 135 vs. 12 after manual inspection across the full train+test set of 21,294 issues.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "LLM-generated patches exhibit distinctive vulnerability patterns (eval injection CWE-95, insecure deserialization CWE-502) that are rare in developer code.",
    378       "evidence": "Table 2 shows CWE-95 in 39 Llama patches vs. 1 developer patch, and CWE-502 in 17 Llama patches vs. 2 developer patches; the paper notes no overlap between confirmed Llama and developer vulnerabilities in the same files.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Agentic frameworks with greater autonomy (OpenHands) produce significantly more vulnerabilities than those with constrained edit scope (AutoCodeRover, HoneyComb).",
    383       "evidence": "Table 1 reports 44 confirmed OH vulnerabilities vs. 3 (ACR) and 2 (HC) on the same test set, attributed to OH's broad autonomy to run commands, edit any files, and generate scripts.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Vulnerable LLM patches are associated with modifications to a larger number of files, not larger code volume.",
    388       "evidence": "Table 3 shows significant differences in files modified (1.72 vs. 1.25, p<0.001, δ=0.20) but fewer LOC in vulnerable patches (1740 vs. 2218, p<0.001), suggesting breadth not volume drives risk.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Missing issue context (no code snippets, no expected behavior, no reproduction steps) is associated with vulnerable Llama patches.",
    393       "evidence": "Only 47.4% of vulnerable-instance issues contain code snippets vs. 58.6% overall (χ²=6.44, p=0.011, φ=0.017); only 39% of bug-related vulnerable issues included expected behavior.",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "methodology_tags": [
    398     "benchmark-eval",
    399     "observational"
    400   ],
    401   "key_findings": "Llama 3.3 (70B) introduces 11 times more new security vulnerabilities than developers on the same 21,294 SWE-bench GitHub issues (135 vs. 12 manually confirmed), with LLM-specific patterns including eval injection (CWE-95) and insecure deserialization (CWE-502) that are rare in developer code. Agentic frameworks also generate vulnerabilities, with OpenHands—given the broadest autonomy—producing an order of magnitude more than AutoCodeRover or HoneyComb. Vulnerable patches across all systems are associated with multi-file, scattered edits rather than large code volume, suggesting cross-file reasoning failures drive risk. At the issue level, missing context (code snippets, expected behavior, reproduction steps) is the strongest predictor of Llama vulnerability introduction, while project-level properties (repository size, complexity, contributor count) show no significant association.",
    402   "red_flags": [
    403     {
    404       "flag": "Unequal evaluation scope",
    405       "detail": "Llama is evaluated on the full 21,294-issue train+test set while agentic frameworks are evaluated on only 2,294 test-set issues, making raw vulnerability counts structurally incomparable even though the paper acknowledges this and avoids direct comparisons."
    406     },
    407     {
    408       "flag": "Oracle retrieval confound",
    409       "detail": "Llama is given the exact files modified by developers (oracle retrieval) to isolate code generation quality, but this differs from both developer and agent settings; this could systematically bias the vulnerability comparison in ways not fully discussed."
    410     },
    411     {
    412       "flag": "Extremely small agent vulnerable samples",
    413       "detail": "ACR (n=3) and HC (n=2) confirmed vulnerabilities are too few for statistical analysis, yet qualitative patterns from these are discussed; the paper acknowledges this but findings for these frameworks remain anecdotal."
    414     },
    415     {
    416       "flag": "No hyperparameters reported",
    417       "detail": "Temperature, top-p, and other inference parameters are not reported for any model, making exact reproduction impossible even with the replication package."
    418     },
    419     {
    420       "flag": "Contamination unaddressed",
    421       "detail": "Llama 3.3's training data cutoff is not stated, and there is no discussion of whether SWE-bench issues (from public GitHub repos) may have been in the training data; this could affect both patch quality and vulnerability patterns."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    427       "relevance": "Primary benchmark used; provides the 20,000+ GitHub issue dataset for all analyses"
    428     },
    429     {
    430       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    431       "relevance": "One of three agentic frameworks evaluated for security vulnerabilities in generated patches"
    432     },
    433     {
    434       "title": "AutoCodeRover: Autonomous Program Improvement",
    435       "relevance": "One of three agentic frameworks evaluated; constrained edit behavior contrasted with OpenHands"
    436     },
    437     {
    438       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    439       "relevance": "Foundational prior work on LLM code security that this study extends to real-world APR context at larger scale"
    440     },
    441     {
    442       "title": "Do users write more insecure code with AI assistants?",
    443       "relevance": "Prior work on human-LLM security interaction; contrasts with this study's focus on autonomous patch generation"
    444     },
    445     {
    446       "title": "Security weaknesses of copilot generated code in github",
    447       "relevance": "Prior large-scale study of LLM code security on real GitHub projects; most similar predecessor study"
    448     },
    449     {
    450       "title": "Adversarial Bug Reports as a Security Risk in Language Model-Based Automated Program Repair",
    451       "relevance": "Closely related adversarial APR security study that this work complements with non-adversarial analysis"
    452     },
    453     {
    454       "title": "Red Teaming Program Repair Agents: When Correct Patches can Hide Vulnerabilities",
    455       "relevance": "Complementary red-teaming study of APR agents; highlights that test-passing does not imply security"
    456     },
    457     {
    458       "title": "Anomalicious: Automated Detection of Anomalous and Potentially Malicious Commits on GitHub",
    459       "relevance": "Foundational abnormal commit detection work that motivates the RQ3 risk factor analysis"
    460     },
    461     {
    462       "title": "An empirical study of static analysis tools for secure code review",
    463       "relevance": "Justifies use of multiple static analysis tools in combination to maximize recall and precision"
    464     }
    465   ],
    466   "engagement_factors": {
    467     "practical_relevance": {
    468       "score": 3,
    469       "justification": "Directly actionable for teams deploying APR tools: provides specific risk signals (multi-file edits, missing issue context) and recommends CI/CD integration of risk assessment before code review."
    470     },
    471     "surprise_contrarian": {
    472       "score": 2,
    473       "justification": "The 11x vulnerability rate is striking and the finding that greater autonomy amplifies rather than mitigates risk (counter to the value proposition of advanced agents) challenges the APR adoption narrative."
    474     },
    475     "fear_safety": {
    476       "score": 3,
    477       "justification": "Raises direct security risk concerns: LLM-generated patches may introduce latent exploitable vulnerabilities (arbitrary code execution, SQL injection, credential compromise) that bypass functional tests."
    478     },
    479     "drama_conflict": {
    480       "score": 2,
    481       "justification": "The framing that AI-generated fixes may silently corrupt codebases with novel vulnerability patterns is alarming, and the finding implicates widely-used systems (OpenHands, AutoCodeRover) used on the SWE-bench leaderboard."
    482     },
    483     "demo_ability": {
    484       "score": 1,
    485       "justification": "Replication package on figshare exists, but reproducing the full pipeline requires running expensive LLM inference and three static analysis tools across 20,000+ issues; not easily demoed."
    486     },
    487     "brand_recognition": {
    488       "score": 2,
    489       "justification": "Uses SWE-bench (well-known benchmark), evaluates OpenHands and AutoCodeRover (top SWE-bench leaderboard entries), and uses Claude-3.5-Sonnet as the underlying LLM for two frameworks."
    490     }
    491   },
    492   "hn_data": {
    493     "threads": [
    494       {
    495         "hn_id": "43905563",
    496         "title": "(How) Do reasoning models reason?",
    497         "points": 3,
    498         "comments": 0,
    499         "url": "https://news.ycombinator.com/item?id=43905563"
    500       },
    501       {
    502         "hn_id": "43751796",
    503         "title": "(How) Do reasoning models reason?",
    504         "points": 2,
    505         "comments": 0,
    506         "url": "https://news.ycombinator.com/item?id=43751796"
    507       },
    508       {
    509         "hn_id": "27786663",
    510         "title": "Telelife: The Future of Remote Living",
    511         "points": 2,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=27786663"
    514       },
    515       {
    516         "hn_id": "44179940",
    517         "title": "Stop Anthropomorphizing Intermediate Tokens as Reasoning/Thinking Traces",
    518         "points": 1,
    519         "comments": 0,
    520         "url": "https://news.ycombinator.com/item?id=44179940"
    521       },
    522       {
    523         "hn_id": "42141765",
    524         "title": "Are Large Language Models Consistent over Value-Laden Questions?",
    525         "points": 1,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=42141765"
    528       },
    529       {
    530         "hn_id": "36857159",
    531         "title": "Origin of Life Molecules in the Atmosphere After Big Impacts on the Early Earth",
    532         "points": 1,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=36857159"
    535       }
    536     ],
    537     "top_points": 3,
    538     "total_points": 10,
    539     "total_comments": 0
    540   }
    541 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs