scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (31338B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Let the Barbarians In: How AI Can Accelerate Systems Performance Research",
      6     "authors": [
      7       "Audrey Cheng",
      8       "Shu Liu",
      9       "Melissa Pan",
     10       "Zhifei Li",
     11       "Shubham Agarwal",
     12       "Mert Cemri",
     13       "Bowen Wang",
     14       "Alexander Krentsel",
     15       "Tian Xia",
     16       "Jongseok Park",
     17       "Shuo Yang",
     18       "Jeff Chen",
     19       "Lakshya Agrawal",
     20       "Ashwin Naren",
     21       "Shulu Li",
     22       "Ruiying Ma",
     23       "Aditya Desai",
     24       "Jiarong Xing",
     25       "Koushik Sen",
     26       "Matei Zaharia",
     27       "Ion Stoica"
     28     ],
     29     "year": 2025,
     30     "venue": "arXiv.org",
     31     "arxiv_id": "2512.14806",
     32     "doi": "10.48550/arXiv.2512.14806"
     33   },
     34   "checklist": {
     35     "claims_and_evidence": {
     36       "abstract_claims_supported": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Abstract claims that ADRS can match or outperform human SOTA are backed by Table 2 across 10 tasks with specific numbers (13× speedup, 35% peak savings). Best-practices and future challenges are substantiated in Sections 5-6.",
     40         "source": "haiku"
     41       },
     42       "causal_claims_justified": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Causal claims about component contributions (seed diversity, feedback granularity, abstraction level) are supported by controlled ablation studies in Section 5 that vary one factor at a time across the same evaluation tasks.",
     46         "source": "haiku"
     47       },
     48       "generalization_bounded": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 6.1 explicitly bounds claims to problems with isolated changes, reliable evaluations, and efficient evaluations. The conclusion states 'it is still very early' and scope is limited to systems performance optimization.",
     52         "source": "haiku"
     53       },
     54       "alternative_explanations_discussed": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper discusses training data contamination (TXN/SMF case), reward hacking (EPLB, Cloudcast, MAS), overfitting to narrow workloads (Section 5.2), and simulator fidelity gaps as alternative explanations for observed results.",
     58         "source": "haiku"
     59       },
     60       "proxy_outcome_distinction": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper explicitly acknowledges that simulators are proxies for real systems (Section 3.2, Section 6.1) and that the performance gains are measured in simulator contexts, not production deployments.",
     64         "source": "haiku"
     65       }
     66     },
     67     "limitations_and_scope": {
     68       "limitations_section_present": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Section 6 'Limitations and Open Challenges' is a dedicated multi-page section covering problem suitability, evaluator design requirements, and open framework challenges.",
     72         "source": "haiku"
     73       },
     74       "threats_to_validity_specific": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Specific limitations named: ADRS fails on coordinated multi-module changes (Paxos, Raft), problems with undecidable semantic equivalence (arbitrary query rewrites), and tasks requiring hours of GPU time. Appendix C.3 categorizes 420 observed failure cases.",
     78         "source": "haiku"
     79       },
     80       "scope_boundaries_stated": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 6.1 explicitly lists three required properties and names the out-of-scope problems it excludes: distributed protocols with interacting state machines and problems solvable by existing ILP solvers.",
     84         "source": "haiku"
     85       }
     86     },
     87     "conflicts_of_interest": {
     88       "funding_disclosed": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No funding acknowledgment appears in the paper text. The paper is from UC Berkeley but no grants, industry funding, or other financial support is disclosed.",
     92         "source": "haiku"
     93       },
     94       "affiliations_disclosed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "All authors are listed as affiliated with UC Berkeley on the paper.",
     98         "source": "haiku"
     99       },
    100       "funder_independent_of_outcome": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Author Lakshya Agrawal is a co-author of both this paper and GEPA (reference [5]), one of the three frameworks being evaluated. This overlap between paper authorship and evaluated system authorship is not disclosed anywhere in the paper.",
    104         "source": "haiku"
    105       },
    106       "financial_interests_declared": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "No competing interests statement or financial interests declaration appears in the paper. The author-GEPA co-authorship conflict is undisclosed.",
    110         "source": "haiku"
    111       }
    112     },
    113     "scope_and_framing": {
    114       "key_terms_defined": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "ADRS is formally defined in Section 3.2 with five named components. 'Verifier,' 'evaluator,' 'reward hacking,' and 'systems performance research' are all defined in context with examples.",
    118         "source": "haiku"
    119       },
    120       "intended_contribution_clear": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 1 explicitly states 'we introduce no new problems, algorithms, or mechanisms' and lists three specific contributions: capability demonstration across 10 case studies, best practices, and implications for the systems community.",
    124         "source": "haiku"
    125       },
    126       "engagement_with_prior_work": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 8 covers pre-LLM AI for systems, AlphaEvolve/OpenEvolve/GEPA frameworks, LLM coding assistants, and LLM-driven research automation, showing how ADRS builds on and extends these lineages.",
    130         "source": "haiku"
    131       }
    132     }
    133   },
    134   "type_checklist": {
    135     "empirical": {
    136       "artifacts": {
    137         "code_released": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No explicit code release from the authors. Config files are in Appendix D and evaluators reference prior open-source papers, but the case study implementations, evolved solutions, and extended simulators are not released.",
    141           "source": "haiku"
    142         },
    143         "data_released": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "Some evaluation data comes from public sources (ShareGPT, GSM8K, BIRD) but the multi-region CBL traces, telemetry traces, and evolved program archives are not released by the authors.",
    147           "source": "haiku"
    148         },
    149         "environment_specified": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Config files in Appendix D specify model names and ADRS hyperparameters but omit Python/OS versions, hardware specs, and dependency versions needed to reproduce the simulator environments.",
    153           "source": "haiku"
    154         },
    155         "reproduction_instructions": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No step-by-step reproduction guide is provided. Config files are given but evaluators from prior papers must be separately obtained and the per-case-study experimental setup is insufficiently detailed to follow without guessing.",
    159           "source": "haiku"
    160         }
    161       },
    162       "statistical_methodology": {
    163         "confidence_intervals_or_error_bars": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Table 3 reports mean ± standard deviation over three runs for each framework-model combination across all 10 tasks.",
    167           "source": "haiku"
    168         },
    169         "significance_tests": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Comparative claims between ADRS and human SOTA are made without formal statistical significance testing. Only three repetitions are used with no p-values reported.",
    173           "source": "haiku"
    174         },
    175         "effect_sizes_reported": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Specific improvements are reported with baseline context: 13× speedup on EPLB, ~6% average cost savings on CBL, 17% improvement on CBL-Multi, 60% makespan reduction on TXN offline (Table 2).",
    179           "source": "haiku"
    180         },
    181         "sample_size_justified": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "Three repetitions per experiment are used but not justified. No power analysis is provided. Some standard deviations in Table 3 are comparable in magnitude to the reported improvements.",
    185           "source": "haiku"
    186         },
    187         "variance_reported": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Table 3 reports standard deviation across three runs for all framework-model-task combinations.",
    191           "source": "haiku"
    192         }
    193       },
    194       "evaluation_design": {
    195         "baselines_included": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Each case study compares ADRS results against the human SOTA solution and/or a greedy baseline. Table 2 lists the specific SOTA publications alongside performance comparisons.",
    199           "source": "haiku"
    200         },
    201         "baselines_contemporary": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Baselines are from recent top-venue publications: NSDI '24 (CBL, Cloudcast), VLDB '24 (TXN), MLSys '25 (LLM-SQL), and NeurIPS '25 (MAS). These are state-of-the-art at time of submission.",
    205           "source": "haiku"
    206         },
    207         "ablation_study": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Section 5 reports ablation studies on seed diversity (LLM-SQL), feedback granularity at three levels (CBL), training set coverage (Appendix C.1), and hint quantity (EPLB, Cloudcast).",
    211           "source": "haiku"
    212         },
    213         "multiple_metrics": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Multiple metrics per task: EPLB uses load balance factor and runtime; CBL uses cost savings and deadline compliance; NS3 uses throughput and queue length; LLM-SQL uses PHR and algorithm runtime.",
    217           "source": "haiku"
    218         },
    219         "human_evaluation": {
    220           "applies": false,
    221           "answer": false,
    222           "justification": "Human evaluation of system outputs is not applicable; all evaluations are automated through simulator-based performance metrics appropriate for the optimization claims being made.",
    223           "source": "haiku"
    224         },
    225         "held_out_test_set": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "For CBL, 30% of traces are a feedback subset during evolution and results are reported on the full evaluation set (Section 4.1). LLM-SQL uses five recommendation datasets. Evaluation workloads are separate from evolutionary feedback signals.",
    229           "source": "haiku"
    230         },
    231         "per_category_breakdown": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Table 3 provides per-task breakdowns across all framework-model combinations. Case studies give per-configuration breakdowns (e.g., per-hardware-profile in CBL, Figure 8; per-dataset in LLM-SQL).",
    235           "source": "haiku"
    236         },
    237         "failure_cases_discussed": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Cloudcast showed no improvement over SOTA. TXN online only matched SOTA. MAS v3 degraded to a 30% success rate due to reward hacking. Appendix C.3 categorizes failure patterns across 420 LLM-judged traces.",
    241           "source": "haiku"
    242         },
    243         "negative_results_reported": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Table 2 explicitly reports Cloudcast as 'No improvement.' TXN online only rediscovered SMF. MAS v3 dropped from 53% to 30% success rate. These are presented openly alongside positive results.",
    247           "source": "haiku"
    248         }
    249       },
    250       "setup_transparency": {
    251         "model_versions_specified": {
    252           "applies": true,
    253           "answer": false,
    254           "justification": "Models identified as 'GPT-5' and 'Gemini-3.0-Pro-Preview' without snapshot dates or version hashes. The ShinkaEvolve appendix config lists different model names (gpt-4.1-mini, gemini-2.5-pro) than the main experiments, adding further ambiguity.",
    255           "source": "haiku"
    256         },
    257         "prompts_provided": {
    258           "applies": true,
    259           "answer": false,
    260           "justification": "Prompt design principles (three-part structure, abstraction level, hints) are described but actual prompts used for any case study are not provided. Appendix D contains framework YAML configs, not problem prompts.",
    261           "source": "haiku"
    262         },
    263         "hyperparameters_reported": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "Appendix D provides detailed YAML configs for all three frameworks including temperature (0.7), top_p (0.95), population_size (100), num_islands (5), migration_rate (0.1), and iteration counts.",
    267           "source": "haiku"
    268         },
    269         "scaffolding_described": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Section 3.2 describes all five ADRS components (Prompt Generator, Solution Generator, Evaluator, Storage, Solution Selector) and the inner/outer loop architecture with Figure 1b. Table 1 details per-framework design choices.",
    273           "source": "haiku"
    274         },
    275         "data_preprocessing_documented": {
    276           "applies": true,
    277           "answer": false,
    278           "justification": "Trace sampling is mentioned (30% CBL training split, ShareGPT/GSM8K for EPLB) but specific preprocessing, filtering, normalization, or trace construction steps are not documented for most case studies.",
    279           "source": "haiku"
    280         }
    281       },
    282       "data_integrity": {
    283         "raw_data_available": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "Evolved programs, evaluation scores, and the 420 LLM-judged failure traces underlying Table 9 are not released. Multi-region CBL and telemetry traces are not made available.",
    287           "source": "haiku"
    288         },
    289         "data_collection_described": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "The PhD student survey (Appendix A) is described as 'a small survey of 31 PhD students' with no collection methodology. Simulator trace sources vary per task with minimal description of how workloads were assembled.",
    293           "source": "haiku"
    294         },
    295         "recruitment_methods_described": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "The 31 PhD student participants are described only as 'in broadly systems research at a US university.' No recruitment, selection, or consent methodology is described.",
    299           "source": "haiku"
    300         },
    301         "data_pipeline_documented": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "The ADRS loop architecture is documented but end-to-end pipelines from raw inputs to reported metrics are incomplete for most case studies. Score aggregation and normalization methods are partially described.",
    305           "source": "haiku"
    306         }
    307       },
    308       "contamination": {
    309         "training_cutoff_stated": {
    310           "applies": true,
    311           "answer": false,
    312           "justification": "Training data cutoffs for GPT-5 and Gemini-3.0-Pro-Preview are not stated anywhere in the paper, despite contamination being explicitly identified as a concern in the TXN case study.",
    313           "source": "haiku"
    314         },
    315         "train_test_overlap_discussed": {
    316           "applies": true,
    317           "answer": true,
    318           "justification": "Section 4.5 explicitly states OpenEvolve rediscovered SMF 'likely due to training data contamination from the SMF paper' and uses the offline result to reduce contamination concerns. This is a substantive, not boilerplate, discussion.",
    319           "source": "haiku"
    320         },
    321         "benchmark_contamination_addressed": {
    322           "applies": true,
    323           "answer": false,
    324           "justification": "Contamination is discussed only for TXN/SMF. For the other nine case studies where ADRS matches or beats published SOTA solutions, no assessment is made of whether those solutions appear in model training data.",
    325           "source": "haiku"
    326         }
    327       },
    328       "human_studies": {
    329         "pre_registered": {
    330           "applies": true,
    331           "answer": false,
    332           "justification": "The PhD student time-distribution survey (Appendix A) was not pre-registered.",
    333           "source": "haiku"
    334         },
    335         "irb_or_ethics_approval": {
    336           "applies": true,
    337           "answer": false,
    338           "justification": "No IRB or ethics approval is mentioned for the 31 PhD student survey.",
    339           "source": "haiku"
    340         },
    341         "demographics_reported": {
    342           "applies": true,
    343           "answer": false,
    344           "justification": "Survey participants are described only as '31 PhD students in broadly systems research at a US university.' No demographic breakdown is provided.",
    345           "source": "haiku"
    346         },
    347         "inclusion_exclusion_criteria": {
    348           "applies": true,
    349           "answer": false,
    350           "justification": "No formal inclusion/exclusion criteria are stated beyond 'PhD students in broadly systems research.' How the sample was bounded or selected is not described.",
    351           "source": "haiku"
    352         },
    353         "randomization_described": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "N/A — the survey is a descriptive questionnaire, not a randomized experiment.",
    357           "source": "haiku"
    358         },
    359         "blinding_described": {
    360           "applies": false,
    361           "answer": false,
    362           "justification": "N/A — blinding is not applicable to a time-distribution survey.",
    363           "source": "haiku"
    364         },
    365         "attrition_reported": {
    366           "applies": false,
    367           "answer": false,
    368           "justification": "N/A — attrition is not applicable to this single-point cross-sectional survey.",
    369           "source": "haiku"
    370         }
    371       },
    372       "cost_and_practicality": {
    373         "inference_cost_reported": {
    374           "applies": true,
    375           "answer": true,
    376           "justification": "Table 2 reports cost and wall-clock time per task (e.g., '5h (100 iters), ≤$15' for EPLB; '5h (100 iters), ≤$30' for CBL), enabling practical cost-benefit assessment.",
    377           "source": "haiku"
    378         },
    379         "compute_budget_stated": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "Per-task costs are reported in Table 2 but the total compute budget for the entire study is not stated. Researcher time for problem specification is acknowledged as significant but not measured.",
    383           "source": "haiku"
    384         }
    385       }
    386     }
    387   },
    388   "claims": [
    389     {
    390       "claim": "ADRS frameworks can generate solutions that match or outperform published human state-of-the-art for systems performance problems across 10 diverse case studies",
    391       "evidence": "Table 2 shows 9 of 10 tasks achieving near-SOTA or better performance in simulators; Cloudcast is the explicit exception with no improvement reported",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "ShinkaEvolve discovers a MoE load balancing algorithm 13× faster than a proprietary frontier-lab implementation while achieving the same load balance factor",
    396       "evidence": "Section 4.3 reports 1.51ms vs 19.5ms runtime; Table 3 shows consistent results (mean ± std) across 3 runs",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "ADRS achieves up to 35% higher cost savings than the NSDI '24 SOTA on the CBL spot scheduling task",
    401       "evidence": "Table 2 reports 'Up to 16% (average 7%) higher cost savings vs. SOTA'; Section 4.1 clarifies 35% is a per-trace maximum over Uniform Progress, average improvement is ~6%",
    402       "supported": "moderate"
    403     },
    404     {
    405       "claim": "Moderate feedback granularity outperforms both minimal and detailed feedback in evolutionary search",
    406       "evidence": "CBL ablation in Section 5.3: moderate (13.0%) > detailed (10.2%) > minimal (7.7%) cost reduction across feedback conditions",
    407       "supported": "moderate"
    408     },
    409     {
    410       "claim": "ADRS can discover cross-domain solutions humans might overlook, including techniques from political science, economics, and electrical engineering",
    411       "evidence": "Table 4 lists five specific cross-domain techniques (including Hamilton's Apportionment, Borda Count, Kirchhoff's Current Law, and UCB exploration) applied in case studies",
    412       "supported": "weak"
    413     },
    414     {
    415       "claim": "Diverse initialization seeds improve ADRS outcomes compared to homogeneous SOTA seeding",
    416       "evidence": "LLM-SQL ablation: diverse seed initialization reaches combined score 0.7755 vs. cap of 0.74 with homogeneous SOTA seeds across islands",
    417       "supported": "weak"
    418     },
    419     {
    420       "claim": "OpenEvolve achieves the highest success rate (best result in 9 of 20 cases) with the most consistent cross-model performance",
    421       "evidence": "Table 3 aggregate: OpenEvolve delivers 4 top results with GPT-5 and 5 with Gemini-3, while GEPA and ShinkaEvolve show stronger model-specific sensitivity",
    422       "supported": "strong"
    423     }
    424   ],
    425   "methodology_tags": [
    426     "benchmark-eval",
    427     "case-study",
    428     "observational"
    429   ],
    430   "key_findings": "Three open-source ADRS frameworks (OpenEvolve, GEPA, ShinkaEvolve) can generate systems performance algorithms that match or exceed human state-of-the-art designs across 10 tasks at costs under $30 per task using 100 evolutionary iterations. Notable results include 13× speedup on MoE load balancing and 60% makespan reduction in offline transaction scheduling. Best practices are identified for specification (diverse seeds, right abstraction), evaluation (diverse workloads, reward hacking prevention via scoped edits), and feedback (calibrated granularity). ADRS is explicitly bounded to problems with isolated, efficiently-verifiable solutions and fails on coordinated multi-module changes. An undisclosed conflict exists: author Lakshya Agrawal co-authored GEPA, one of the three evaluated frameworks.",
    431   "red_flags": [
    432     {
    433       "flag": "Author-evaluatee conflict undisclosed",
    434       "detail": "Lakshya Agrawal is both a paper co-author and a co-author of GEPA (reference [5]), one of the three ADRS frameworks being evaluated and compared. This conflict of interest is not disclosed anywhere in the paper."
    435     },
    436     {
    437       "flag": "Simulator-to-production gap unquantified",
    438       "detail": "All 10 case studies use simulators rather than real system deployments. The fidelity gap between simulator performance and production behavior is acknowledged in Section 6 but never empirically quantified for any task."
    439     },
    440     {
    441       "flag": "Training contamination addressed for only 1 of 10 tasks",
    442       "detail": "Training data contamination is explicitly noted for TXN/SMF in Section 4.5 but not assessed for the other nine case studies, including tasks where AI matches SOTA solutions that plausibly appear in GPT/Gemini training corpora."
    443     },
    444     {
    445       "flag": "Three runs without power justification",
    446       "detail": "Only 3 repetitions per experiment are reported with no power analysis. Several Table 3 entries show standard deviations comparable in magnitude to differences between frameworks, making statistical reliability unclear."
    447     },
    448     {
    449       "flag": "Model versions not pinned",
    450       "detail": "GPT-5 and Gemini-3.0-Pro-Preview are used without snapshot dates. The ShinkaEvolve appendix config lists different model names (gpt-4.1-mini, gemini-2.5-pro) than the main paper, suggesting inconsistency in what was actually run."
    451     },
    452     {
    453       "flag": "Peak vs. average framing inflates headline claims",
    454       "detail": "Table 2 headline says 'Up to 35% greater savings' for CBL but average improvement is ~6-7%. The 35% figure is a single-trace maximum, not the typical or median result, and is potentially misleading without immediate qualification."
    455     }
    456   ],
    457   "cited_papers": [
    458     {
    459       "title": "AlphaEvolve: A Coding Agent for Scientific and Algorithmic Discovery",
    460       "relevance": "Proprietary ADRS framework from Google DeepMind; the direct inspirational predecessor to the open-source frameworks evaluated in this paper"
    461     },
    462     {
    463       "title": "GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning",
    464       "relevance": "One of three evaluated frameworks; uses natural-language reflection and Pareto filtering for diverse high-performing solutions"
    465     },
    466     {
    467       "title": "ShinkaEvolve: Towards Open-Ended and Sample-Efficient Program Evolution",
    468       "relevance": "One of three evaluated frameworks; emphasizes structured introspection, correctness gating, and periodic meta-reflection"
    469     },
    470     {
    471       "title": "Can't Be Late: Optimizing Spot Instance Savings under Deadlines",
    472       "relevance": "NSDI '24 outstanding paper whose cloud scheduling algorithm serves as the primary SOTA baseline improved upon in Case Study 1"
    473     },
    474     {
    475       "title": "Towards Optimal Transaction Scheduling",
    476       "relevance": "VLDB '24 paper whose SMF algorithm ADRS rediscovers in the online setting and substantially improves (60%) in the offline setting"
    477     },
    478     {
    479       "title": "Why Do Multi-Agent LLM Systems Fail?",
    480       "relevance": "Co-authored by paper authors; provides the MAST failure taxonomy used to score the MAS case study evaluation"
    481     },
    482     {
    483       "title": "MLGym: A New Framework and Benchmark for Advancing AI Research Agents",
    484       "relevance": "Related work on benchmarking AI research agents; contextualizes ADRS within the broader AI-for-research movement"
    485     },
    486     {
    487       "title": "Faster Sorting Algorithms Discovered Using Deep Reinforcement Learning (AlphaDev)",
    488       "relevance": "Precedent for AI discovering novel algorithms surpassing human-designed solutions in low-level systems contexts"
    489     },
    490     {
    491       "title": "Optimizing LLM Queries in Relational Data Analytics Workloads",
    492       "relevance": "LLM-SQL SOTA paper whose GGR algorithm ADRS matches on PHR while achieving 3× runtime speedup via evolved prefix-aware reordering"
    493     },
    494     {
    495       "title": "PowerTCP: Pushing the Performance Limits of Datacenter Networks",
    496       "relevance": "NSDI '22 SOTA for datacenter congestion control; ADRS extends it in the NS3 case study, reducing queue length by 49%"
    497     }
    498   ],
    499   "engagement_factors": {
    500     "practical_relevance": {
    501       "score": 3,
    502       "justification": "Systems researchers can immediately apply the three open-source ADRS frameworks with the config files and best practices provided; multiple real tasks show meaningful improvements at under $30 per task."
    503     },
    504     "surprise_contrarian": {
    505       "score": 2,
    506       "justification": "The claim that AI already beats top-venue published human SOTA in systems research is moderately surprising; the specific cross-domain discoveries (Hamilton's Apportionment for GPU load balancing) are genuinely novel illustrations."
    507     },
    508     "fear_safety": {
    509       "score": 1,
    510       "justification": "Paper raises researcher role displacement but frames it entirely positively as elevation rather than replacement; no AI safety concerns are discussed."
    511     },
    512     "drama_conflict": {
    513       "score": 2,
    514       "justification": "The 'Let the Barbarians In' framing and the argument that researchers must embrace AI-driven methods or become obsolete create a mild urgency narrative; the prior work was titled 'Barbarians at the Gate.'"
    515     },
    516     "demo_ability": {
    517       "score": 2,
    518       "justification": "OpenEvolve and GEPA are open-source with configs provided, but reproducing specific case study results requires obtaining paper-specific simulators and workload traces from prior papers."
    519     },
    520     "brand_recognition": {
    521       "score": 2,
    522       "justification": "The Zaharia/Stoica group at UC Berkeley is highly regarded in systems research (Spark, Ray, vLLM lineage); NSDI and VLDB SOTA baselines lend credibility to the benchmarks chosen."
    523     }
    524   },
    525   "hn_data": {
    526     "threads": [
    527       {
    528         "hn_id": "45625666",
    529         "title": "What to do after detecting a signal from extraterrestrial intelligence",
    530         "points": 14,
    531         "comments": 15,
    532         "url": "https://news.ycombinator.com/item?id=45625666",
    533         "created_at": "2025-10-18T07:48:26Z"
    534       },
    535       {
    536         "hn_id": "45613026",
    537         "title": "Every Language Model Has a Forgery-Resistant Signature",
    538         "points": 7,
    539         "comments": 2,
    540         "url": "https://news.ycombinator.com/item?id=45613026",
    541         "created_at": "2025-10-17T03:22:16Z"
    542       },
    543       {
    544         "hn_id": "43193918",
    545         "title": "Ringworlds and Dyson spheres can be stable",
    546         "points": 6,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=43193918",
    549         "created_at": "2025-02-27T12:48:58Z"
    550       },
    551       {
    552         "hn_id": "46196480",
    553         "title": "SETI Post-Detection Protocols: Progress Towards a New Version",
    554         "points": 4,
    555         "comments": 0,
    556         "url": "https://news.ycombinator.com/item?id=46196480",
    557         "created_at": "2025-12-08T19:24:37Z"
    558       },
    559       {
    560         "hn_id": "45629330",
    561         "title": "Every Language Model Has a Forgery-Resistant Signature",
    562         "points": 2,
    563         "comments": 1,
    564         "url": "https://news.ycombinator.com/item?id=45629330",
    565         "created_at": "2025-10-18T18:17:58Z"
    566       },
    567       {
    568         "hn_id": "43210649",
    569         "title": "A Comprehensive Survey on Concept Erasure in Text-to-Image Diffusion Models",
    570         "points": 2,
    571         "comments": 0,
    572         "url": "https://news.ycombinator.com/item?id=43210649",
    573         "created_at": "2025-02-28T20:56:28Z"
    574       },
    575       {
    576         "hn_id": "47172923",
    577         "title": "Midtraining Bridges Pretraining and Posttraining Distributions",
    578         "points": 1,
    579         "comments": 0,
    580         "url": "https://news.ycombinator.com/item?id=47172923",
    581         "created_at": "2026-02-26T22:29:32Z"
    582       },
    583       {
    584         "hn_id": "44825079",
    585         "title": "Aligning LLMs to Ask Good Questions a Case Study in Clinical Reasoning",
    586         "points": 1,
    587         "comments": 0,
    588         "url": "https://news.ycombinator.com/item?id=44825079",
    589         "created_at": "2025-08-07T14:39:48Z"
    590       },
    591       {
    592         "hn_id": "38772282",
    593         "title": "YAYI 2",
    594         "points": 1,
    595         "comments": 0,
    596         "url": "https://news.ycombinator.com/item?id=38772282",
    597         "created_at": "2023-12-26T15:00:58Z"
    598       }
    599     ],
    600     "top_points": 14,
    601     "total_points": 38,
    602     "total_comments": 18
    603   }
    604 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs