scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28103B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Does AI Code Review Lead to Code Changes? A Case Study of GitHub Actions",
      6     "authors": [
      7       "Kexin Sun",
      8       "Hongyu Kuang",
      9       "Sebastian Baltes",
     10       "Xin Zhou",
     11       "He Zhang",
     12       "Xiaoxing Ma",
     13       "Guoping Rong",
     14       "Dong Shao",
     15       "Christoph Treude"
     16     ],
     17     "year": 2025,
     18     "venue": "arXiv.org",
     19     "arxiv_id": "2508.18771",
     20     "doi": "10.48550/arXiv.2508.18771"
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "All major abstract claims are supported: 22,326 comments in 178 repos are reported in Table IV, wide effectiveness variation (0.9%–19.2%) in Table VIII, and SHAP analysis in Table X supports the design factors claim.",
     28         "source": "haiku"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The title frames the question causally ('Lead to Code Changes') and conclusions use directional language, but the study is purely observational; the authors explicitly acknowledge in Section VI that 'interpretations describe associations... not causal effects.'",
     34         "source": "haiku"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section VI explicitly notes findings may not generalize to large-scale projects (most are ≤50 non-bot contributors), non-English repositories, or tools beyond the 16 studied; the language filter excluded 75% of comments.",
     40         "source": "haiku"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper does not seriously consider that coderabbitai's better addressing rate could reflect project-type selection bias (e.g., repos adopting it being more review-mature) rather than tool design; confounding between tool choice and repo characteristics is unaddressed.",
     46         "source": "haiku"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper clearly distinguishes that 'file-level code changes' is the proxy for 'comment addressed,' and explicitly acknowledges in Section VI that this 'does not capture the degree or impact of the resulting code changes.'",
     52         "source": "haiku"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section VI 'Threats to Validity' is a dedicated section covering construct, internal, and external validity with multiple specific threats.",
     60         "source": "haiku"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Specific threats include: language filter removing 75% of comments (32 Korean-only repos excluded), github-actions[bot] attribution ambiguity, restriction to small/medium projects (≤50 non-bot contributors), and data collection freeze at Feb 2025.",
     66         "source": "haiku"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper explicitly states findings may not generalize to very large projects, non-English repositories, or tools not in their sample; RQ3 analysis is limited to 4 of the 16 actions.",
     72         "source": "haiku"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No funding disclosure appears anywhere in the provided paper text.",
     80         "source": "haiku"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "All nine authors have institutional affiliations listed (Nanjing University, University of Bayreuth, Singapore Management University) with corresponding emails.",
     86         "source": "haiku"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No funding is disclosed, so independence cannot be assessed.",
     92         "source": "haiku"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests or financial interests statement appears in the paper.",
     98         "source": "haiku"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Key terms are defined: 'hunk' (a contiguous block of differing lines, citing GNU diffutils), 'addressing' (whether a comment led to code changes), and the three granularity levels (PR/file/hunk) are illustrated with examples in Figs 1–2.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Three explicit contributions are listed in Section I: (1) systematic adoption study, (2) LLM-assisted framework for assessing comment addressing, (3) interpretable factor analysis via Random Forest + SHAP.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section VII organizes related work into two thematic areas (AI code review automation and developer response to automated feedback), directly contrasting this study with prior work on human review usefulness factors and GenAI code review quality evaluation.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The appendix [12] is described as 'to be published on Zenodo after acceptance' — this is a conditional promise of future release, not an actual release.",
    129           "source": "haiku"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Dataset is likewise promised 'to be published on a preserved archive after acceptance'; not available at time of submission.",
    135           "source": "haiku"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Python libraries used (FastText, PyYAML, difflib) are mentioned in passing but no requirements.txt, Dockerfile, or version-pinned dependency list is provided.",
    141           "source": "haiku"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "Step-by-step instructions are referenced as part of the online appendix (not yet published); the paper text alone is insufficient to reproduce the pipeline without guessing implementation details.",
    147           "source": "haiku"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "Addressing rates and accuracy figures are reported as point estimates only; no confidence intervals or error bars accompany main results despite running LLMs five times.",
    155           "source": "haiku"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Fisher's exact test is used to compare addressing rates across trigger modes and LLM series in Table XI, with p-values reported.",
    161           "source": "haiku"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Effect sizes are implicit in the addressing-rate comparisons (e.g., 60% human vs 0.9%–19.2% AI; 6.8% auto vs 12.8% manual for ID-1), and SHAP importance values quantify feature contribution magnitude.",
    167           "source": "haiku"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "The 150-comment annotation sample (50 per category) is stated without power analysis or justification for why this size is sufficient for the inter-rater agreement estimates.",
    173           "source": "haiku"
    174         },
    175         "variance_reported": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "LLMs were run five times for 'robust evaluation' but variance or standard deviation across runs is never reported; only single accuracy figures are presented.",
    179           "source": "haiku"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Human-authored review comments from the same repositories and time periods serve as a direct baseline; 60% human addressing rate vs 0.9%–19.2% AI is the central comparison.",
    187           "source": "haiku"
    188         },
    189         "baselines_contemporary": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Human reviews are collected from the same repositories during the same time window as AI-generated comments, making them contemporaneous and ecologically valid comparators.",
    193           "source": "haiku"
    194         },
    195         "ablation_study": {
    196           "applies": true,
    197           "answer": false,
    198           "justification": "The paper compares model combinations for RQ2 (cross-combining top Stage-1 and Stage-2 models) and compares Random Forest vs logistic regression, but there is no ablation of the feature set or pipeline components.",
    199           "source": "haiku"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Multiple metrics are used: overall accuracy, Cohen's κ (per-stage and full 6-class), addressing rates, SHAP importance values, and Fisher's exact test p-values across different breakdowns.",
    205           "source": "haiku"
    206         },
    207         "human_evaluation": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Two independent annotators (one author + one external graduate student) labeled 150 sampled comments with inter-rater agreement measured by Cohen's κ (0.674–0.764); a third author resolved disagreements.",
    211           "source": "haiku"
    212         },
    213         "held_out_test_set": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "The Random Forest classifier uses an 80/20 train/test split, and the 150 annotated comments serve as the held-out test set for evaluating the LLM classification framework.",
    217           "source": "haiku"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Results are broken down by review granularity (PR/file/hunk), by individual action ID, by trigger mode (auto/manual), by LLM series (GPT-3.5/GPT-4), and by author experience bins.",
    223           "source": "haiku"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Section V.A discusses failure modes explicitly: vague outputs ('Without more context, it is difficult to provide further suggestions'), hallucinated style warnings, and generic code summaries are cited as concrete failure examples.",
    229           "source": "haiku"
    230         },
    231         "negative_results_reported": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "The central finding is that 74.9% of valid AI comments are not addressed; mattzcarey/code-review-gpt achieves only 0.9% addressing; and 37.1% of repositories declared an action but generated zero comments.",
    235           "source": "haiku"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "Model names are listed (gpt-4.1, o3-mini, claude-3-sonnet, deepseek-r1, etc.) but exact API snapshot versions are deferred to the unpublished appendix; no snapshot dates are given in the paper.",
    243           "source": "haiku"
    244         },
    245         "prompts_provided": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "The Stage-1 and Stage-2 prompts are explicitly deferred to the online appendix ('details of the LLM-assisted framework with specific prompts are available in the online appendix'), which is not yet published.",
    249           "source": "haiku"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Temperature=0 is explicitly reported for all LLM evaluations; five-run repetition is stated for robustness.",
    255           "source": "haiku"
    256         },
    257         "scaffolding_described": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The two-stage LLM pipeline (Stage-1: validity detection → Stage-2: addressing assessment) is described in Section IV.B with the decoupled model selection rationale and classification scheme.",
    261           "source": "haiku"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "Filtering steps are detailed: language detection via FastText, exclusion of non-merged PRs, first-in-thread restriction, bot account exclusion via login name pattern, and file renaming resolution via GitHub API.",
    267           "source": "haiku"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "Raw data is promised for Zenodo 'after acceptance' but is not currently available; the GitHub link is referenced as 'to be published on a preserved archive after acceptance.'",
    275           "source": "haiku"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "GitHub REST API endpoints, search queries (e.g., 'repo:{repo_name} reviewed-by:github-actions[bot] is:pr'), filtering criteria, and diff reconstruction methods are described in detail in Section IV.",
    281           "source": "haiku"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "The two human annotators are described only as 'one author and an external graduate student in software engineering who is not a co-author'; no selection criteria or recruitment process is described.",
    287           "source": "haiku"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "The full pipeline is documented across Phases I–IV: GitHub API collection → language filtering → annotation sampling → LLM classification → Random Forest modeling, with dataset sizes at each stage reported.",
    293           "source": "haiku"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "The study uses LLMs as classification tools on GitHub data, not as subjects in a benchmark evaluation; training data contamination of the benchmark is not applicable.",
    301           "source": "haiku"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "Not applicable — the paper is not evaluating LLM capabilities on a pre-existing benchmark that could appear in training data.",
    307           "source": "haiku"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No standard benchmark is used; the evaluation dataset is newly collected from GitHub, making benchmark contamination not applicable.",
    313           "source": "haiku"
    314         }
    315       },
    316       "human_studies": {
    317         "pre_registered": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "The human annotation is a methodological calibration step, not a human subjects study; pre-registration is not applicable.",
    321           "source": "haiku"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human subjects research requiring IRB approval; the annotation involves project team members rating code review comments.",
    327           "source": "haiku"
    328         },
    329         "demographics_reported": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "Not applicable; the annotators are project personnel, not research participants in a human subjects study.",
    333           "source": "haiku"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "Not applicable for this type of annotation task.",
    339           "source": "haiku"
    340         },
    341         "randomization_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "Not applicable; no randomized experiment involving human participants.",
    345           "source": "haiku"
    346         },
    347         "blinding_described": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "Not applicable; no human subjects experiment requiring blinding.",
    351           "source": "haiku"
    352         },
    353         "attrition_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "Not applicable; no longitudinal human participant study.",
    357           "source": "haiku"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Multiple LLMs were run 5 times each on 150 annotated examples and then applied to 5,652 comments, but no inference cost or latency figures are reported.",
    365           "source": "haiku"
    366         },
    367         "compute_budget_stated": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "No total computational budget is stated anywhere in the paper.",
    371           "source": "haiku"
    372         }
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "Valid AI-generated code review comments are addressed at rates of only 0.9%–19.2%, compared to 60% for human reviewers.",
    379       "evidence": "Table VIII reports per-action valid addressing rates: ID-2 at 0.9%, ID-1 at 4.2%, ID-4 at 6.5%, ID-3 at 19.2%, human at 60.0%.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Adoption of AI code review actions is highly concentrated: four actions account for 91.1% of repositories, 95.2% of pull requests, and 98.9% of comments.",
    384       "evidence": "Table IV shows ID-1 to ID-4 dominate usage; the paper states these figures explicitly in Section IV.A results.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Manually triggered AI code review comments are more likely to be addressed than automatically triggered ones.",
    389       "evidence": "Table XI shows 12.8% vs 6.8% for ID-1 (p≤0.05), 22.2% vs 0.5% for ID-2 (p≤0.05); SHAP directionality ρ=−0.97 for Trigger_auto in Table X.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Comments with a high code-to-text ratio (>0.5) are substantially more likely to be addressed.",
    394       "evidence": "Table XII shows addressing rate rises to 23.2% for AI comments in the highest code-text-ratio bin vs ~4–7% for low-ratio bins; SHAP importance rank 1 among comment features (ρ=0.89).",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "AI-generated comments targeting experienced contributors (>1,013 prior commits) are addressed at only 3.3%, compared to 16.1% for newcomers (≤30 commits).",
    399       "evidence": "Table XII (right) shows this pattern; SHAP directionality ρ=−0.67 for Author Prior Commits.",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "The LLM-assisted two-stage classification framework achieves 86.1% overall accuracy and a Cohen's κ of 0.767 on the full 6-class task.",
    404       "evidence": "Table VII reports these figures for the optimal cross-combined setup (gpt-4.1 for Stage-1, o3-mini for Stage-2) across all three comment source categories.",
    405       "supported": "strong"
    406     },
    407     {
    408       "claim": "37.1% of mature repositories declared an AI code review action but generated zero comments.",
    409       "evidence": "Stated explicitly in Section IV.A results with supporting numbers from Table IV (178 mature repos, many with zero comments).",
    410       "supported": "strong"
    411     }
    412   ],
    413   "methodology_tags": [
    414     "observational",
    415     "case-study"
    416   ],
    417   "key_findings": "AI-generated code review comments are addressed at dramatically lower rates (0.9%–19.2%) than human-written ones (60%), demonstrating that current tools have limited practical impact. Tool design matters enormously: hunk-level granularity, manual triggering, and comments rich in concrete code suggestions are the strongest positive predictors of developer responsiveness, while automatically triggered file-level comments are largely ignored. A two-stage LLM pipeline (gpt-4.1 for validity detection, o3-mini for addressing assessment) achieves 86.1% accuracy and a Cohen's κ of 0.767, providing a scalable method to monitor tool effectiveness. The dominant 'one-in-one-out' paradigm — generating a comment for every change regardless of merit — is identified as a root cause of low precision and developer fatigue.",
    418   "red_flags": [
    419     {
    420       "flag": "Causal framing of observational data",
    421       "detail": "The title and conclusions use causal language ('lead to code changes', 'impact') but the design is entirely observational; the authors acknowledge that their findings describe associations rather than causal effects only in the threats section."
    422     },
    423     {
    424       "flag": "Severe language filter bias",
    425       "detail": "Applying an English-language filter removed 75% of comments (12,533 of 16,762), including 32 entirely Korean repositories; findings about addressing rates may not generalize to non-English developer communities."
    426     },
    427     {
    428       "flag": "Data and code not released",
    429       "detail": "The dataset, scripts, and prompts are all deferred to an online appendix described as 'to be published on Zenodo after acceptance' — reproducibility is impossible without this material."
    430     },
    431     {
    432       "flag": "Tool-project confounding",
    433       "detail": "Projects that choose coderabbitai vs anc95/ChatGPT-CodeReview may differ systematically in review culture, size, and maturity; observed tool performance differences could reflect selection bias rather than tool quality."
    434     },
    435     {
    436       "flag": "Thin annotation validation set",
    437       "detail": "Only 150 comments (50 per category) were manually annotated to validate a framework applied to 5,652 comments; this is 2.7% of the analysis dataset, and the sample was drawn only from comments where the reviewed file was subsequently modified."
    438     },
    439     {
    440       "flag": "Cascading model error",
    441       "detail": "LLM-assigned labels are used as ground truth for training a Random Forest classifier, stacking two sources of classification error; the 88.5% Random Forest accuracy inherits the LLM's imperfect labels."
    442     }
    443   ],
    444   "cited_papers": [
    445     {
    446       "title": "Modern code review: a case study at Google",
    447       "relevance": "Foundational context on human code review practices that AI tools are augmenting or replacing."
    448     },
    449     {
    450       "title": "Characteristics of useful code reviews: An empirical study at Microsoft",
    451       "relevance": "Prior work on what makes human code review comments useful, providing comparative basis for evaluating AI-generated comments."
    452     },
    453     {
    454       "title": "Predicting usefulness of code review comments using textual features and developer experience",
    455       "relevance": "Directly related prior work on factors predicting comment usefulness; this paper's feature engineering is explicitly inspired by it."
    456     },
    457     {
    458       "title": "Github actions: the impact on the pull request process",
    459       "relevance": "Studies GitHub Actions' impact on software development workflows — the infrastructure platform central to this study."
    460     },
    461     {
    462       "title": "Automating code review activities by large-scale pre-training",
    463       "relevance": "Representative prior work on AI/ML-based code review automation, which this empirical study complements by measuring real-world impact."
    464     },
    465     {
    466       "title": "Automated code review in practice",
    467       "relevance": "Industrial study of AI code review deployment, complementary case study from a different setting."
    468     },
    469     {
    470       "title": "Llama-reviewer: Advancing code review automation with large language models through parameter-efficient fine-tuning",
    471       "relevance": "Recent work on LLM-based code review comment generation — directly in scope for this survey."
    472     },
    473     {
    474       "title": "On the use of GitHub actions in software development repositories",
    475       "relevance": "Empirical study of GitHub Actions adoption patterns, providing context for interpreting this paper's RQ1 findings."
    476     }
    477   ],
    478   "engagement_factors": {
    479     "practical_relevance": {
    480       "score": 3,
    481       "justification": "Directly actionable for any team evaluating or deploying AI code review tools — provides tool rankings, design recommendations, and a monitoring framework."
    482     },
    483     "surprise_contrarian": {
    484       "score": 2,
    485       "justification": "The 0.9%–19.2% vs 60% addressing rate gap challenges optimism about AI code review effectiveness, and the finding that manual triggering outperforms automatic review is counterintuitive."
    486     },
    487     "fear_safety": {
    488       "score": 1,
    489       "justification": "Raises concerns about AI review quality (hallucinated warnings, vague summaries) but not safety-critical outcomes."
    490     },
    491     "drama_conflict": {
    492       "score": 1,
    493       "justification": "Implicit product comparison (coderabbitai vs others) provides a ranking angle but no significant controversy."
    494     },
    495     "demo_ability": {
    496       "score": 2,
    497       "justification": "All 16 reviewed GitHub Actions are publicly available on GitHub Marketplace, making the findings immediately testable by practitioners."
    498     },
    499     "brand_recognition": {
    500       "score": 1,
    501       "justification": "No famous lab affiliation; coderabbitai is a recognized product name but the paper is from academic institutions."
    502     }
    503   },
    504   "hn_data": {
    505     "threads": [
    506       {
    507         "hn_id": "43198812",
    508         "title": "Symmetries of Living Systems",
    509         "points": 8,
    510         "comments": 0,
    511         "url": "https://news.ycombinator.com/item?id=43198812"
    512       },
    513       {
    514         "hn_id": "45367764",
    515         "title": "Fill probability estimates in institutional bond trading with quantum computers",
    516         "points": 2,
    517         "comments": 2,
    518         "url": "https://news.ycombinator.com/item?id=45367764"
    519       },
    520       {
    521         "hn_id": "44961416",
    522         "title": "Group Sequence Policy Optimization",
    523         "points": 2,
    524         "comments": 1,
    525         "url": "https://news.ycombinator.com/item?id=44961416"
    526       },
    527       {
    528         "hn_id": "44041341",
    529         "title": "Grounded in Context: Retrieval-Based Method for Hallucination Detection",
    530         "points": 1,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=44041341"
    533       },
    534       {
    535         "hn_id": "43242677",
    536         "title": "FastAtlas: Real-Time Compact Atlases for Texture Space Shading",
    537         "points": 1,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=43242677"
    540       },
    541       {
    542         "hn_id": "29567026",
    543         "title": "Transient execution flaws found in AMD Zen CPUs",
    544         "points": 1,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=29567026"
    547       }
    548     ],
    549     "top_points": 8,
    550     "total_points": 15,
    551     "total_comments": 3
    552   }
    553 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs