scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28361B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evolving Excellence: Automated Optimization of LLM-based Agents",
      6     "authors": [
      7       "Paul Brookes",
      8       "Vardan Voskanyan",
      9       "Rafail Giavrimis",
     10       "Matthew Truscott",
     11       "Mina Ilieva",
     12       "Chrystalla Pavlou",
     13       "Alexandru Staicu",
     14       "Manal Adham",
     15       "Will Evers-Hood",
     16       "Jingzhi Gong",
     17       "Kejia Zhang",
     18       "Matvey Fedoseev",
     19       "Vishal Sharma",
     20       "Roman Bauer",
     21       "Zheng Wang",
     22       "Hema Nair",
     23       "Wei Jie",
     24       "Tianhua Xu",
     25       "Aurora Constantin",
     26       "Carmine Ventre",
     27       "Leslie Kanthan",
     28       "Michail Basios"
     29     ],
     30     "year": 2025,
     31     "venue": "arXiv.org",
     32     "arxiv_id": "2512.09108",
     33     "doi": "10.48550/arXiv.2512.09108"
     34   },
     35   "checklist": {
     36     "claims_and_evidence": {
     37       "abstract_claims_supported": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The abstract's specific claims (13.6% ALE, 10.1% Mini-SWE, 36.9% CrewAI token reduction, 22% MathTales accuracy) are all substantiated by empirical results in Section 6 with reported statistics.",
     41         "source": "haiku"
     42       },
     43       "causal_claims_justified": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper attributes improvements to evolutionary optimization but never ablates whether the evolutionary mechanism itself (vs. any prompt variation) is causal; no comparison against random search or manual prompt engineering is included.",
     47         "source": "haiku"
     48       },
     49       "generalization_bounded": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 7 explicitly bounds claims: 'automated optimization is not universally beneficial' and conditions for success (poorly-tuned agents, well-defined metrics) are spelled out; project-level variance in Mini-SWE is also acknowledged.",
     53         "source": "haiku"
     54       },
     55       "alternative_explanations_discussed": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The central alternative — that any prompt rewrite (manual or random) would yield similar gains — is never considered. The paper does not discuss whether the evolutionary mechanism adds value over simpler baselines.",
     59         "source": "haiku"
     60       },
     61       "proxy_outcome_distinction": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Claims map directly to measured outcomes: acceptance rate is directly counted, performance score comes from benchmark execution, token cost is directly measured; no conflation of proxy metrics with broader constructs.",
     65         "source": "haiku"
     66       }
     67     },
     68     "limitations_and_scope": {
     69       "limitations_section_present": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 6.6 'Key Insights and Limitations' and Section 8 'Conclusion, Limitations, and Future Work' both contain dedicated limitations discussions with specific examples beyond a single concluding sentence.",
     73         "source": "haiku"
     74       },
     75       "threats_to_validity_specific": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Specific threats are named: benchmark overfitting ('improvements on benchmarks may not translate to real-world usage'), project-level variance in Mini-SWE (pylint -0.1%), non-significant ALE results (p=0.10), and small sample sizes limiting validation.",
     79         "source": "haiku"
     80       },
     81       "scope_boundaries_stated": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper explicitly states Artemis 'works best for tasks with objective solving approaches, well-defined success criteria, and measurable outcomes' and that 'well-tuned agents may offer limited room for further optimization.'",
     85         "source": "haiku"
     86       }
     87     },
     88     "conflicts_of_interest": {
     89       "funding_disclosed": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The acknowledgment section explicitly discloses EU Horizon 2020 Grant 101008280 (DIOR) as funding source with a CORDIS project URL.",
     93         "source": "haiku"
     94       },
     95       "affiliations_disclosed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "All authors list institutional affiliations in the paper header; multiple authors are identified as affiliated with TurinTech AI, the commercial developer of Artemis.",
     99         "source": "haiku"
    100       },
    101       "funder_independent_of_outcome": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "EU Horizon 2020 is an independent public research funder with no financial stake in Artemis's commercial success.",
    105         "source": "haiku"
    106       },
    107       "financial_interests_declared": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "TurinTech AI employees evaluate their own commercial Artemis platform. No competing interests statement or financial interests declaration is present in the paper.",
    111         "source": "haiku"
    112       }
    113     },
    114     "scope_and_framing": {
    115       "key_terms_defined": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Definition 1 formally defines 'agent configuration' as C = (P, T, M, Θ) covering prompts, tool descriptions, model assignments, and continuous parameters; the optimization objective is formalized in Equation 1.",
    119         "source": "haiku"
    120       },
    121       "intended_contribution_clear": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Four numbered contributions are explicitly enumerated in the introduction: the Artemis platform, novel semantic mutation/crossover operators, systematic experiments with statistical validation, and analysis of optimization success factors.",
    125         "source": "haiku"
    126       },
    127       "engagement_with_prior_work": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 2 organizes related work into four paradigms and Table 1 provides structured comparative analysis across five dimensions, clearly positioning Artemis relative to APE, PromptBreeder, ADAS, AFlow, AlphaEvolve, and DSPy.",
    131         "source": "haiku"
    132       }
    133     }
    134   },
    135   "type_checklist": {
    136     "empirical": {
    137       "artifacts": {
    138         "code_released": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Section 7 states 'we are going to open source the code for all four case study agents as supplementary material' — a future promise only; the complete Artemis platform itself cannot be shared.",
    142           "source": "haiku"
    143         },
    144         "data_released": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "All evaluation benchmarks are standard public resources: AtCoder Heuristic Contest, SWE-Perf, Math Odyssey, and GSM8K are publicly available.",
    148           "source": "haiku"
    149         },
    150         "environment_specified": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No requirements.txt, Dockerfile, or dependency specifications are provided. Model names (Claude 3.5 Sonnet, Qwen2.5-7B) are given without snapshot dates or API version identifiers.",
    154           "source": "haiku"
    155         },
    156         "reproduction_instructions": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "No step-by-step reproduction instructions are provided. The Artemis platform is proprietary, and the promised agent code has not yet been released at time of publication.",
    160           "source": "haiku"
    161         }
    162       },
    163       "statistical_methodology": {
    164         "confidence_intervals_or_error_bars": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "95% CIs are explicitly reported for ALE results (baseline [0.594, 0.726], optimized [0.689, 0.811]); other experiments report p-values from hypothesis tests supporting variance estimates.",
    168           "source": "haiku"
    169         },
    170         "significance_tests": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Mann-Whitney U test used for Mini-SWE (p<0.005); p-values reported throughout for all comparisons including CrewAI cost (p<10^-6) and MathTales (p<0.001).",
    174           "source": "haiku"
    175         },
    176         "effect_sizes_reported": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Percentage improvements with baseline values are given for all experiments (e.g., 66.0%→75.0% for ALE, 12033→7329 tokens for CrewAI), providing effect size context throughout.",
    180           "source": "haiku"
    181         },
    182         "sample_size_justified": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "No power analysis or statistical justification is provided for sample sizes (40 ALE problems, 140 Mini-SWE functions, 30 CrewAI problems per run, 50/300 MathTales problems). Sizes appear benchmark-determined.",
    186           "source": "haiku"
    187         },
    188         "variance_reported": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Variance is captured via 95% CIs for ALE, 12 evaluation runs for CrewAI with run-level comparison, and 3 repeated runs for MathTales; non-parametric testing for Mini-SWE incorporates distributional spread.",
    192           "source": "haiku"
    193         }
    194       },
    195       "evaluation_design": {
    196         "baselines_included": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Pre-optimization baselines are included for all four agents with reported metrics: ALE 0.660, Mini-SWE 0.891, CrewAI accuracy 0.82 and 12033 tokens, MathTales accuracy 0.59.",
    200           "source": "haiku"
    201         },
    202         "baselines_contemporary": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "The 'baselines' are unoptimized agent configurations only; no empirical comparison against other optimization methods (DSPy, APE, PromptBreeder, random search) is conducted. Table 1 comparison is purely conceptual.",
    206           "source": "haiku"
    207         },
    208         "ablation_study": {
    209           "applies": true,
    210           "answer": false,
    211           "justification": "No ablation isolates the evolutionary mechanism from alternatives. The ALE prompt-vs-search comparison tests two optimization strategies but does not ablate whether the genetic algorithm itself adds value over simpler approaches.",
    212           "source": "haiku"
    213         },
    214         "multiple_metrics": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Multiple metrics are used across experiments: acceptance rate, performance score, accuracy, completeness, token cost, and per-project breakdowns for Mini-SWE.",
    218           "source": "haiku"
    219         },
    220         "human_evaluation": {
    221           "applies": false,
    222           "answer": false,
    223           "justification": "All evaluations use automated benchmark scoring (acceptance rate, performance score, token count, mathematical correctness); human evaluation is not relevant for these tasks.",
    224           "source": "haiku"
    225         },
    226         "held_out_test_set": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "MathTales-Teacher uses a separate validation set (50 problems for optimization selection) and evaluation set (300 problems for final assessment). CrewAI uses stratified sampling with held-out test problems.",
    230           "source": "haiku"
    231         },
    232         "per_category_breakdown": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "Mini-SWE reports per-project results across 9 Python libraries (requests +20%, scikit-learn +29%, astropy +62%, pylint -0.1%). MathTales reports accuracy and completeness separately.",
    236           "source": "haiku"
    237         },
    238         "failure_cases_discussed": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Section 7 explicitly discusses failure cases: pylint -0.1%, CrewAI accuracy slight decrease, ALE non-significance, and the principle that well-tuned agents show limited improvement potential.",
    242           "source": "haiku"
    243         },
    244         "negative_results_reported": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "CrewAI accuracy decreased by 3.7% (p=0.277, not significant); the ALE improvement did not reach significance (p=0.10); Mini-SWE showed a project-level regression (pylint -0.1%). All are reported honestly.",
    248           "source": "haiku"
    249         }
    250       },
    251       "setup_transparency": {
    252         "model_versions_specified": {
    253           "applies": true,
    254           "answer": false,
    255           "justification": "Mini-SWE uses 'Claude 3.5 Sonnet' without snapshot date. The LLM ensemble used internally by Artemis is never specified. 'Qwen2.5-7B' is the only model named with a version identifier, and even it is given without a weights checkpoint.",
    256           "source": "haiku"
    257         },
    258         "prompts_provided": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Before/after prompts are shown verbatim in Figures 5, 7, 13, and 14, covering all four agent case studies with full prompt text.",
    262           "source": "haiku"
    263         },
    264         "hyperparameters_reported": {
    265           "applies": true,
    266           "answer": false,
    267           "justification": "GA parameters are sparsely reported: population size 3 and 2 generations mentioned only for MathTales. Temperature, top-p, and LLM inference parameters are never reported for any experiment.",
    268           "source": "haiku"
    269         },
    270         "scaffolding_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "The Artemis optimization pipeline (component discovery, local/global optimization, hierarchical evaluation with LLM scoring then benchmark execution) is described in Section 4 with figures. Each agent's architecture is described.",
    274           "source": "haiku"
    275         },
    276         "data_preprocessing_documented": {
    277           "applies": true,
    278           "answer": false,
    279           "justification": "Stratified sampling is mentioned for CrewAI (30 from 387 problems) and MathTales (50 from GSM8K), but the stratification criteria and problem selection procedures are not documented sufficiently to reproduce.",
    280           "source": "haiku"
    281         }
    282       },
    283       "data_integrity": {
    284         "raw_data_available": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "No raw evaluation outputs, optimization logs, or per-run results are released. The Artemis platform output cannot be independently verified.",
    288           "source": "haiku"
    289         },
    290         "data_collection_described": {
    291           "applies": true,
    292           "answer": true,
    293           "justification": "The paper uses established public benchmarks (AtCoder Heuristic Contest, SWE-Perf, Math Odyssey, GSM8K) with citations, and evaluation metric definitions are provided for each benchmark.",
    294           "source": "haiku"
    295         },
    296         "recruitment_methods_described": {
    297           "applies": false,
    298           "answer": false,
    299           "justification": "Standard public benchmarks are used; no participant recruitment is involved.",
    300           "source": "haiku"
    301         },
    302         "data_pipeline_documented": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "The optimization pipeline is described conceptually but no code is released. The full path from problem input to fitness evaluation cannot be independently verified or reproduced.",
    306           "source": "haiku"
    307         }
    308       },
    309       "contamination": {
    310         "training_cutoff_stated": {
    311           "applies": true,
    312           "answer": false,
    313           "justification": "Training data cutoffs for Claude 3.5 Sonnet and Qwen2.5-7B are not stated anywhere in the paper, despite using benchmarks (especially GSM8K from 2021) that are almost certainly in training data.",
    314           "source": "haiku"
    315         },
    316         "train_test_overlap_discussed": {
    317           "applies": true,
    318           "answer": false,
    319           "justification": "Potential overlap between training data and benchmarks (particularly GSM8K published 2021, likely memorized) is never discussed; the paper does not acknowledge that models may have seen these problems.",
    320           "source": "haiku"
    321         },
    322         "benchmark_contamination_addressed": {
    323           "applies": true,
    324           "answer": false,
    325           "justification": "GSM8K (2021) was almost certainly included in Claude 3.5 Sonnet's and Qwen2.5-7B's training data, directly undermining validity of the MathTales-Teacher results. This is never addressed.",
    326           "source": "haiku"
    327         }
    328       },
    329       "human_studies": {
    330         "pre_registered": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "irb_or_ethics_approval": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "demographics_reported": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "inclusion_exclusion_criteria": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         },
    354         "randomization_described": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "No human participants.",
    358           "source": "haiku"
    359         },
    360         "blinding_described": {
    361           "applies": false,
    362           "answer": false,
    363           "justification": "No human participants.",
    364           "source": "haiku"
    365         },
    366         "attrition_reported": {
    367           "applies": false,
    368           "answer": false,
    369           "justification": "No human participants.",
    370           "source": "haiku"
    371         }
    372       },
    373       "cost_and_practicality": {
    374         "inference_cost_reported": {
    375           "applies": true,
    376           "answer": true,
    377           "justification": "Per-evaluation costs are explicitly reported: $24-26 per ALE run, $30-60 per Mini-SWE run, with per-problem token costs for CrewAI (12033 vs 7329 average tokens).",
    378           "source": "haiku"
    379         },
    380         "compute_budget_stated": {
    381           "applies": true,
    382           "answer": true,
    383           "justification": "Total optimization time is reported: 671.7 hours for ALE (411.2h prompt + 260.5h search), 9 hours for Mini-SWE, with variable time noted for others.",
    384           "source": "haiku"
    385         }
    386       }
    387     }
    388   },
    389   "claims": [
    390     {
    391       "claim": "Artemis achieves a 13.6% improvement in ALE Agent acceptance rate on AtCoder Heuristic Contest",
    392       "evidence": "Acceptance rate increased from 0.660 (95% CI: [0.594, 0.726]) to 0.750 (95% CI: [0.689, 0.811]); p=0.10, not statistically significant",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Artemis achieves a statistically significant 10.1% performance improvement for Mini-SWE Agent on SWE-Perf",
    397       "evidence": "Performance score increased from 0.891 to 0.981 using Mann-Whitney U test (p<0.005); apply rate and correctness maintained at 92.1% and 87.9%",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "Artemis achieves a 36.9% reduction in token cost for CrewAI Agent on Math Odyssey with only a non-significant 4% accuracy decrease",
    402       "evidence": "Average token count reduced from 12033 to 7329 (p<10^-6); accuracy decrease -3.7% (p=0.277) over 12 evaluation runs",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "Artemis improves MathTales-Teacher Agent accuracy by 22% on GSM8K across a 300-problem evaluation set",
    407       "evidence": "Accuracy 0.59→0.81 and completeness 0.796→0.917 (p<0.001) across 3 repeated runs; however GSM8K contamination is unaddressed",
    408       "supported": "moderate"
    409     },
    410     {
    411       "claim": "Joint multi-component optimization captures interdependencies that isolated component optimization misses",
    412       "evidence": "Stated as motivation in the introduction but no empirical ablation compares joint vs. isolated optimization",
    413       "supported": "unsupported"
    414     },
    415     {
    416       "claim": "Evolutionary optimization outperforms manual trial-and-error for LLM agent configuration tuning",
    417       "evidence": "Only compared against unoptimized baselines; no comparison against manual prompt engineering, random search, or other automated methods",
    418       "supported": "weak"
    419     }
    420   ],
    421   "methodology_tags": [
    422     "benchmark-eval",
    423     "case-study"
    424   ],
    425   "key_findings": "Artemis, a commercial evolutionary optimization platform by TurinTech AI, demonstrates statistically significant improvements for three of four evaluated agent systems: a 10.1% performance gain for Mini-SWE code optimization (p<0.005), a 36.9% token cost reduction for CrewAI math reasoning (p<10^-6), and a 22-percentage-point accuracy gain (0.59→0.81) for a small Qwen2.5-7B model on GSM8K (p<0.001). The ALE competitive programming improvement (13.6%) did not reach statistical significance (p=0.10). Optimization effectiveness depends on initial configuration quality — poorly-tuned agents benefit most — and on task characteristics favoring well-defined objective metrics. Critical limitations include the absence of comparison against other optimization methods, no code release, potential GSM8K benchmark contamination, and an undisclosed commercial conflict of interest from the authors evaluating their own product.",
    426   "red_flags": [
    427     {
    428       "flag": "Undisclosed commercial conflict of interest",
    429       "detail": "TurinTech AI employees evaluate their own commercial Artemis platform. No competing interests statement is present despite the obvious financial interest in positive results."
    430     },
    431     {
    432       "flag": "Non-significant primary result prominently featured",
    433       "detail": "The 13.6% ALE improvement (p=0.10) fails the α=0.05 threshold yet is highlighted in the abstract, introduction, and conclusion as a key result."
    434     },
    435     {
    436       "flag": "No comparison against other optimization methods",
    437       "detail": "No empirical comparison against DSPy, APE, PromptBreeder, random search, or manual prompt engineering. The claim of superiority rests entirely on comparison to unoptimized baselines."
    438     },
    439     {
    440       "flag": "Code not released at publication",
    441       "detail": "Source code is promised as future open source but is unavailable at time of publication. The core Artemis platform cannot be shared, making reproduction impossible."
    442     },
    443     {
    444       "flag": "GSM8K contamination unaddressed",
    445       "detail": "GSM8K (published 2021) was almost certainly in training data for both Claude 3.5 Sonnet and Qwen2.5-7B. The 22% accuracy improvement may partly reflect prompts that better elicit memorized answers rather than improved reasoning."
    446     },
    447     {
    448       "flag": "Trivially small evolutionary search",
    449       "detail": "MathTales uses only population size 3 and 2 generations — equivalent to evaluating 6 total prompt variants. This is functionally indistinguishable from exhaustive search over a tiny candidate set."
    450     }
    451   ],
    452   "cited_papers": [
    453     {
    454       "title": "Large Language Models are Human-Level Prompt Engineers (APE)",
    455       "relevance": "Foundational automated prompt optimization; direct prior work that Artemis builds upon and claims to extend beyond with full agent configuration optimization"
    456     },
    457     {
    458       "title": "PromptBreeder: Self-Referential Self-Improvement via Prompt Evolution",
    459       "relevance": "Evolutionary prompt optimization approach; closest prior work to Artemis's genetic algorithm methodology for natural language component optimization"
    460     },
    461     {
    462       "title": "Automated Design of Agentic Systems (ADAS)",
    463       "relevance": "Code-structure-based agent workflow optimization; Artemis explicitly positions itself as superior by being architecture-agnostic rather than requiring code-level access"
    464     },
    465     {
    466       "title": "AFlow: Automating Agentic Workflow Generation",
    467       "relevance": "MCTS-based workflow optimization; second key comparison target in Table 1 for positioning Artemis's generality"
    468     },
    469     {
    470       "title": "Why Do Multi-Agent LLM Systems Fail? MAST: Multi-Agent System Failure Taxonomy",
    471       "relevance": "Taxonomy of 14 failure modes motivating the need for systematic agent configuration analysis and optimization"
    472     },
    473     {
    474       "title": "SWE-Bench: Can Language Models Resolve Real-World GitHub Issues?",
    475       "relevance": "Primary code agent benchmark; cited to contextualize Artemis's 57% resolution rate claim and as standard of comparison for code agent capability"
    476     },
    477     {
    478       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    479       "relevance": "Evaluation benchmark used for MathTales-Teacher experiment; one of the most widely used math reasoning benchmarks in LLM research"
    480     },
    481     {
    482       "title": "AlphaEvolve: A Coding Agent for Scientific and Algorithmic Discovery",
    483       "relevance": "Related evolutionary LLM coding agent demonstrating novel algorithmic improvements through closed-loop generation and verification"
    484     },
    485     {
    486       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    487       "relevance": "Core reasoning-action architecture used in MathTales-Teacher agent; foundational agent framework being optimized by Artemis"
    488     },
    489     {
    490       "title": "SWE-Perf: Can Language Models Optimize Code Performance on Real-World Repositories?",
    491       "relevance": "Benchmark used for Mini-SWE Agent evaluation across 9 Python libraries; establishes the evaluation protocol for code performance optimization"
    492     }
    493   ],
    494   "engagement_factors": {
    495     "practical_relevance": {
    496       "score": 2,
    497       "justification": "Practitioners facing LLM agent configuration overhead can directly apply the optimization concept, though Artemis is commercial and not open-source."
    498     },
    499     "surprise_contrarian": {
    500       "score": 1,
    501       "justification": "The finding that cost optimization (36.9% token reduction) can be achieved with negligible accuracy loss is mildly surprising; most other results confirm expected behavior."
    502     },
    503     "fear_safety": {
    504       "score": 0,
    505       "justification": "No AI safety or risk concerns raised; the work focuses entirely on prompt optimization for benchmark performance."
    506     },
    507     "drama_conflict": {
    508       "score": 1,
    509       "justification": "Undisclosed commercial conflict of interest and prominent reporting of a non-significant result could attract critical methodological scrutiny."
    510     },
    511     "demo_ability": {
    512       "score": 1,
    513       "justification": "Artemis has a commercial web interface, but it is not publicly available or open-source; practitioners cannot easily try it without engaging TurinTech AI."
    514     },
    515     "brand_recognition": {
    516       "score": 1,
    517       "justification": "TurinTech AI is a lesser-known commercial startup; use of Claude 3.5 Sonnet and GSM8K provides name recognition for the benchmarks but not the lab."
    518     }
    519   },
    520   "hn_data": {
    521     "threads": [
    522       {
    523         "hn_id": "25471098",
    524         "title": "Causality Is Graphically Simple",
    525         "points": 90,
    526         "comments": 7,
    527         "url": "https://news.ycombinator.com/item?id=25471098",
    528         "created_at": "2020-12-18T19:48:36Z"
    529       },
    530       {
    531         "hn_id": "45574705",
    532         "title": "StreamingVLM: Real-Time Understanding for Infinite Video Streams",
    533         "points": 33,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=45574705",
    536         "created_at": "2025-10-14T00:02:18Z"
    537       },
    538       {
    539         "hn_id": "45591789",
    540         "title": "StreamingVLM: Real-Time Understanding for Infinite Video Streams",
    541         "points": 1,
    542         "comments": 0,
    543         "url": "https://news.ycombinator.com/item?id=45591789",
    544         "created_at": "2025-10-15T13:02:15Z"
    545       },
    546       {
    547         "hn_id": "42362464",
    548         "title": "RoboHanger: Learning Generalizable Robotic Hanger Insertion for Diverse Garments",
    549         "points": 1,
    550         "comments": 0,
    551         "url": "https://news.ycombinator.com/item?id=42362464",
    552         "created_at": "2024-12-09T02:05:35Z"
    553       }
    554     ],
    555     "top_points": 90,
    556     "total_points": 125,
    557     "total_comments": 7
    558   }
    559 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs