scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26818B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "GreenServ: Energy-Efficient Context-Aware Dynamic Routing for Multi-Model LLM Inference",
      6     "authors": [
      7       "Thomas Ziller",
      8       "Shashikant Ilager",
      9       "Alessandro Tundo",
     10       "Ezio Bartocci",
     11       "Leonardo Mariani",
     12       "Ivona Brandic"
     13     ],
     14     "year": 2026,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2601.17551",
     17     "doi": "10.48550/arXiv.2601.17551"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All abstract claims are verified: 22% accuracy gain and 31% energy reduction vs. random are reported in §6.3.1 with 50-run CIs; RouterBench results (71.7% avg, 75.7% peak) appear in Table 1.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Causal claims are supported by controlled experiments with 50 runs, 95% confidence intervals, an ablation study over contextual features (§6.2.3), and a model-addition adaptability experiment (§6.2.4).",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The limitations section explicitly states results are from 'controlled environments' on a single A100 GPU and do not account for concurrency, queuing, or varied hardware, clearly bounding the scope of generalization.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The paper acknowledges that all contextual baselines (ε-Greedy, Thompson Sampling) achieve comparable performance to LinUCB, concluding that 'feature engineering, model pool, and reward design are critical factors' rather than the specific bandit algorithm.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper explicitly distinguishes its direct GPU energy measurement (via zeus library in watt-hours) from proxy metrics used in prior work (API costs, token budgets), and accuracy is measured via objective EM/ROUGE against ground truth.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 6.4 'Discussion' contains a dedicated paragraph listing five specific limitations beyond generic disclaimers.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats include: MAB stationary reward assumption vulnerable to distribution drift; evaluation restricted to tasks with objective ground truth (EM/ROUGE) only; hardware-specific latency profiles; sensitivity to cluster count K and bin count N hyperparameters; controlled environment lacking concurrency and queuing.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly states it 'focus[es] exclusively on tasks where accuracy can be measured objectively' and that operational conditions with concurrency and batch processing are outside the scope of the current study.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding acknowledgment or grant disclosure appears anywhere in the paper.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All six authors' institutional affiliations (TU Wien, University of Amsterdam, University of Milano-Bicocca) are listed on the title page.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No funding was disclosed, making this criterion not applicable.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "There is no competing interests statement, conflicts of interest declaration, or patent/equity disclosure anywhere in the paper.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 3 formally defines accuracy (Eq. 1 variant), energy consumption (Eq. 1), the routing problem (Eq. 2), regret (Eqs. 7–8), and the scalarized reward function (Eq. 5) with all parameters explained.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper lists four explicit contributions in the introduction: adaptive routing framework, multi-feature query context representation, comprehensive baseline evaluation, and extensive empirical evaluation.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 categorizes prior routing work into three generations (static, embedding-based, dynamic/adaptive) and explicitly identifies three gaps GreenServ addresses: limited continual learning, proxy cost metrics, and calibration overhead for new models.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "Code is released at an anonymous open-science repository (anonymous.4open.science/r/llm-inference-router-EBEA) for review; it is publicly accessible but under an anonymous pre-publication link rather than a permanent repository.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "All five evaluation datasets (MMLU, HellaSwag, Winogrande, GSM8K, CNN/DM) are standard publicly available benchmarks loaded via HuggingFace datasets.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper lists specific libraries used (FastAPI, Redis, PostgreSQL, sentence-transformers, scikit-learn, transformers, PyTorch, NumPy, zeus) and Python 3.10, but provides no requirements.txt, Dockerfile, or equivalent formal dependency specification.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "The experimental setup is described in §6.1 and the algorithm in §4–5, but there are no step-by-step instructions for reproducing the full pipeline; the anonymous repo README is referenced but not described in the paper.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": true,
    151           "justification": "The paper states 'Results include 95% confidence intervals, where appropriate' and Figure 2 shows error bars on accuracy and energy comparisons from 50 runs.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "The paper relies solely on confidence intervals and visual comparison of distributions; no formal statistical significance tests (t-tests, Wilcoxon, etc.) are applied to comparative claims.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Effect sizes are reported as percentage improvements: 22% accuracy gain, 31% energy reduction vs. random; 64–77% energy reduction vs. static baselines; median regret reductions quantified in §6.3.1.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The choice of 500 samples per dataset (2,500 total queries) and 50 experimental runs is stated but not justified by power analysis or other formal reasoning.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "Variance is reported via 95% CIs in Figures 2–4, shaded confidence bands in Figure 3, and distribution boxplots across 50 runs in Figure 5.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Four static baselines (Random, Largest, Smallest, Highest Accuracy) and two alternative MAB baselines (ε-Greedy NC/C, Thompson Sampling C) are included.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "The model pool includes recent models (Gemma-3, Qwen2.5, Phi-4, Llama-3.2) published in 2024–2025, and RouterBench (2024) is used for external validation; baselines are not artificially weak.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Section 6.2.3 performs a full ablation over all single and combined feature configurations (None, Task, Cluster, Complexity, Task+Cluster, Task+Complexity, Cluster+Complexity, Full) across 50 runs each.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Results are reported on normalized accuracy, total energy consumption (Wh), cumulative regret, moving-average regret, AIQ score (RouterBench), model selection frequency, and overhead latency.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": false,
    206           "answer": false,
    207           "justification": "The evaluation uses objective automatic metrics (EM, ROUGE) against ground truth; human evaluation is not applicable to this system performance study.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "500 instances are sampled from the test set partition of each benchmark using a fixed random seed, with no overlap with any training data used by the routing system.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Model selection frequency is broken down per model in Figures 6–8; RouterBench results are averaged across 9 tasks; the feature ablation provides per-feature-configuration breakdowns.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "The cold start period (high erratic regret for ~50 initial queries) is explicitly shown in Figure 3 and discussed; limitations of stationary reward assumptions for distribution drift are acknowledged.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The ablation reveals that adding all features raises regret above task-only configurations, attributed to increased dimensionality slowing convergence; Cluster alone slightly reduces regret while Complexity alone slightly increases it.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Table 2 in the appendix lists all 16 models with exact HuggingFace identifiers (e.g., 'Qwen/Qwen2.5-0.5B-Instruct', 'google/gemma-3-27b-it', 'microsoft/Phi-4-mini-instruct').",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "The paper describes extracting 'instruction text from the initial lines of the prompt' but does not provide actual prompt templates, formatting, or system instructions used during inference.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "All key hyperparameters are reported in §6.1.5: LinUCB α=0.1, λ_reg=0.05; ε-Greedy ε₀=1.0, δ=0.98, ε_min=0.01; CTS σ=0.01; K=3 clusters, N_bins=3 complexity bins.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "GreenServ is a routing system using a bandit algorithm, not an LLM agent with scaffolding; models are invoked as black-box inference endpoints.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Preprocessing is documented: datasets sampled with fixed random seed (500 per test set), embeddings via all-MiniLM-L6-v2, task labels from dataset origin, complexity via Flesch score with equal-width binning, online K-Means clustering.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "Raw experimental outputs (per-query energy measurements, accuracy logs across 50 runs) are not described as being released; only the code and standard benchmark datasets are referenced.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Data collection is described: benchmark datasets loaded via HuggingFace, 500 instances sampled from test partitions with fixed random seed, energy measured via zeus GPU power monitoring, latency via Python time module.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participants; standard public benchmarks were used without recruitment.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The full pipeline from query arrival through feature extraction, model selection, inference, monitoring (accuracy, energy, latency), reward computation, and MAB update is documented in Algorithm 1 and §4–5.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": false,
    296           "answer": false,
    297           "justification": "The paper evaluates a routing system's efficiency, not LLM generalization capabilities; benchmark contamination of the underlying models is not the object of study.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "Not applicable; the paper's claim is about relative routing performance gains, not absolute model benchmark scores as a measure of model capability.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "Not applicable for the same reason as above; the routing framework's value is assessed relative to baselines, not as an uncontaminated capability estimate.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants in this study.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants in this study.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in this study.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in this study.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in this study.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in this study.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in this study.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": true,
    361           "justification": "Energy consumption in Wh is a primary metric throughout; overhead latency per component is reported in Table 4 (6.68–7.77 ms total); per-model inference latency statistics appear in Table 3.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Per-experiment energy is reported (~165 Wh for GreenServ at λ=0.4 over 2,500 queries) but the total compute budget for running all experiments (50 runs × multiple configurations) is not disclosed.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "GreenServ achieves 22% higher accuracy and 31% lower energy consumption compared to random routing at λ=0.4",
    376       "evidence": "Figure 2a and §6.3.1 summary, based on 50 runs with 95% confidence intervals; GreenServ ≈0.65 normalized accuracy vs. random ≈0.51, and ≈165 Wh vs. random baseline",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "GreenServ achieves 71.7% average accuracy and 75.7% peak accuracy on RouterBench across ~36k queries",
    381       "evidence": "Table 1 reports AIQ=0.607, Peak Acc=75.7%, Avg Acc=71.7% for GreenServ on RouterBench external benchmark",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Task type is the single most informative contextual feature, dropping median cumulative regret to ~400",
    386       "evidence": "Figure 5 ablation across 50 runs shows the task feature alone achieves ~400 median cumulative regret, while the cluster feature alone reduces median regret by 17 and the complexity feature alone increases it by 7 relative to the no-context baseline",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "GreenServ adapts to new model addition without requiring offline recalibration",
    391       "evidence": "Figure 6 shows Gemma-3-12b reaches 20–25% selection frequency within ~100 queries after being added at step 1000, displacing previously preferred models",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "The routing and feature extraction overhead (6.68–7.77 ms) is negligible relative to LLM inference time",
    396       "evidence": "Table 4 reports component-level overhead; Table 3 shows median inference latencies range from 36.1 ms (Llama-3.2-1B) to 199.7 ms (Gemma-3-27B), making the overhead roughly 3.3–21.5% of median inference latency (6.68 ms / 199.7 ms to 7.77 ms / 36.1 ms)",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "Using full context features can degrade performance compared to task-type-only routing",
    401       "evidence": "Figure 5 shows Full features configuration has higher median cumulative regret than Task-only or Task+Cluster, attributed to increased dimensionality slowing MAB convergence",
    402       "supported": "moderate"
    403     },
    404     {
    405       "claim": "GreenServ operates at accuracy-energy points that exceed the static single-model Pareto front",
    406       "evidence": "Figure 2b shows GreenServ and contextual baselines positioned beyond the static Pareto front (red dashed line), achieving combinations no single model achieves alone",
    407       "supported": "strong"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "benchmark-eval",
    412     "observational"
    413   ],
    414   "key_findings": "GreenServ demonstrates that context-aware online routing using a contextual multi-armed bandit (LinUCB) consistently outperforms both static single-model deployments and random routing, achieving 22% higher accuracy and 31% lower energy consumption compared to random selection at the tested operating point. The key driver of performance is the task-type contextual feature rather than the specific bandit algorithm, as all contextual variants (LinUCB, ε-Greedy, Thompson Sampling) achieve comparable results. GreenServ achieves accuracy-energy trade-off points that exceed the static Pareto front, meaning no single fixed model can replicate its combined efficiency. Routing overhead is negligible at 6.68–7.77 ms per query relative to LLM inference latencies of 36–200 ms.",
    415   "red_flags": [
    416     {
    417       "flag": "Single hardware configuration",
    418       "detail": "All experiments run on a single NVIDIA A100 80GB GPU; the paper acknowledges generalization to other GPU architectures requires separate latency profiling, limiting the practical scope of reported results."
    419     },
    420     {
    421       "flag": "Normalized accuracy uses estimated bounds",
    422       "detail": "Normalized accuracy uses min-max bounds estimated from 'strategically selected' small and large models on a validation set (Phi2-3B for min, Qwen2.5-32B for max), introducing potential bias if these do not represent true extremes on all tasks."
    423     },
    424     {
    425       "flag": "No formal significance testing",
    426       "detail": "Comparative performance claims rely on confidence interval overlap rather than formal hypothesis tests (t-test, Wilcoxon), making it unclear whether differences between similar contextual algorithms are statistically significant."
    427     },
    428     {
    429       "flag": "Anonymous-only code release",
    430       "detail": "Code is available via an anonymous review-only link (anonymous.4open.science), which may not persist post-publication; no permanent public repository is guaranteed."
    431     },
    432     {
    433       "flag": "Evaluation restricted to objective-metric tasks",
    434       "detail": "All benchmarks require ground-truth labels (EM/ROUGE); the framework's behavior on open-ended generation tasks that constitute the majority of production LLM usage is untested and acknowledged as a limitation."
    435     },
    436     {
    437       "flag": "No funding disclosure",
    438       "detail": "No funding source is declared anywhere in the paper, making it impossible to assess potential conflicts of interest."
    439     }
    440   ],
    441   "cited_papers": [
    442     {
    443       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    444       "relevance": "Primary related work on learning-based LLM routing; GreenServ directly compares its approach against the RouteLLM paradigm"
    445     },
    446     {
    447       "title": "RouterBench: A Benchmark for Multi-LLM Routing System",
    448       "relevance": "Used as the external validation benchmark for GreenServ; provides the AIQ metric and ~36k query evaluation set"
    449     },
    450     {
    451       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    452       "relevance": "Foundational work establishing cost-quality tradeoffs in LLM inference that GreenServ addresses with energy rather than API cost"
    453     },
    454     {
    455       "title": "A Contextual-Bandit Approach to Personalized News Article Recommendation",
    456       "relevance": "Introduces LinUCB, the core routing algorithm used in GreenServ"
    457     },
    458     {
    459       "title": "MixLLM: Dynamic Routing in Mixed Large Language Models",
    460       "relevance": "Closest prior work combining contextual bandits with continual learning for LLM routing; GreenServ differentiates by using direct energy measurement"
    461     },
    462     {
    463       "title": "Measuring Massive Multitask Language Understanding",
    464       "relevance": "MMLU benchmark used as one of five evaluation datasets in GreenServ experiments"
    465     },
    466     {
    467       "title": "LLM Bandit: Cost-Efficient LLM Generation via Preference-Conditioned Dynamic Routing",
    468       "relevance": "Directly related prior work using multi-armed bandits for LLM routing with cost objectives"
    469     },
    470     {
    471       "title": "GraphRouter: A Graph-Based Router for LLM Selections",
    472       "relevance": "Recent learning-based routing approach using graph neural networks that GreenServ positions against in the related work taxonomy"
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 3,
    478       "justification": "Directly addresses a real deployment cost problem with open-source code, 16 production models, and a framework anyone running multi-model inference can adopt."
    479     },
    480     "surprise_contrarian": {
    481       "score": 1,
    482       "justification": "The finding that context-aware routing beats static routing is expected; the more interesting finding that full features can hurt and task-type alone suffices adds mild surprise."
    483     },
    484     "fear_safety": {
    485       "score": 1,
    486       "justification": "Energy consumption framing taps into AI sustainability concerns but does not raise safety or risk concerns directly."
    487     },
    488     "drama_conflict": {
    489       "score": 0,
    490       "justification": "No controversy, conflict with prior work claims, or contested findings; straightforward systems paper."
    491     },
    492     "demo_ability": {
    493       "score": 2,
    494       "justification": "Anonymous code repo is publicly accessible and the system runs on standard HuggingFace models, making it feasible to try for someone with a GPU."
    495     },
    496     "brand_recognition": {
    497       "score": 1,
    498       "justification": "Academic paper from TU Wien, University of Amsterdam, and University of Milano-Bicocca; no famous lab or widely recognized product name attached."
    499     }
    500   },
    501   "hn_data": {
    502     "threads": [
    503       {
    504         "hn_id": "47150074",
    505         "title": "Large-Scale Study of GitHub Pull Requests: How AI Coding Agents Modify Code",
    506         "points": 2,
    507         "comments": 0,
    508         "url": "https://news.ycombinator.com/item?id=47150074",
    509         "created_at": "2026-02-25T11:15:17Z"
    510       }
    511     ],
    512     "top_points": 2,
    513     "total_points": 2,
    514     "total_comments": 0
    515   }
    516 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs