scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27712B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "In-Context Distillation with Self-Consistency Cascades: A Simple, Training-Free Way to Reduce LLM Agent Costs",
      6     "authors": [
      7       "Vishnu Sarukkai",
      8       "Asanshay Gupta",
      9       "James Hong",
     10       "Michael Gharbi",
     11       "Kayvon Fatahalian"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2512.02543",
     16     "doi": "10.48550/arXiv.2512.02543"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims are supported: 2.5× cost reduction on ALFWorld at teacher-level accuracy is confirmed by Table 1 (0.96 accuracy at 0.41 relative cost vs teacher 0.89/1.0), and Appendix C gives explicit USD cost figures ($0.059 → $0.024) with amortization analysis.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper makes causal claims about in-context distillation improving accuracy and cascades reducing cost; ablations (Student ZS vs IC vs IC+Cascade vs Cascade-only vs Random Mix) isolate the contribution of each component and provide adequate support for causal inference.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The abstract and conclusion claim the approach 'makes advanced agentic systems economically viable for a broader range of applications,' yet evaluation is limited to two benchmarks (ALFWorld, AppWorld); the benchmark selection rationale mentions structural-pattern sharing but the conclusions extend well beyond tested settings.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider whether any retrieved in-context examples (rather than teacher-specific demonstrations) would yield the same gains, nor whether performance improvements could stem from increased context length rather than knowledge transfer; comparisons are against model configurations, not content-controlled baselines.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures task success rate (binary, as defined by benchmark unit tests) and inference cost (USD via token counts), and claims are made precisely in terms of these same metrics without conflating them with broader constructs.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section; Section 7 (Discussion) promotes the method's advantages and briefly contextualizes it against fine-tuning, but does not systematically enumerate limitations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats to validity are discussed; there is no mention of single-run variance, benchmark contamination, distributional assumptions, or generalization risks to unseen task structures.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper notes it targets tasks with 'structural patterns enabling cross-task learning,' but there is no explicit statement of what the results do NOT show or what classes of tasks the method is likely to fail on.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The acknowledgments state: 'Support for this project was provided by Reve and Roblox, and API credits were provided by OpenAI and together.ai.'",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations (Stanford University and Reve) are listed on the title page; James Hong and Michael Gharbi are affiliated with Reve, which is also a funder.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Reve is both a funder and has co-authors (Hong, Gharbi) as employees; the method directly reduces costs of running LLM agents at scale, which is commercially relevant to Reve's business interests.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests declaration, no statement about patents or equity; the acknowledgments list funders but do not address financial interests of the authors.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper defines 'in-context distillation' (Sections 1 and 4.3), formally specifies teacher/student models and the optimization objective (Section 3), and explains the ReAct-style agent architecture (Section 4.1).",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The intended contribution — a training-free, retrieval-based distillation method combined with self-consistency cascades for reducing LLM agent costs — is explicitly stated in the abstract and elaborated in Section 1.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 engages substantively with model distillation, consistency-based cascades, prompt optimization, and agent improvement lines of work, explaining how this paper differs from each rather than merely listing citations.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository is mentioned anywhere in the paper; prompts are provided in Appendix B but the full implementation is not released.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Both ALFWorld and AppWorld are publicly available benchmarks used unmodified; demonstration data derives from running models on these public task sets.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements file, Dockerfile, or environment specification is provided; the paper only mentions closed-weight API access and MiniLM-L6-v2 as the embedding model without version pinning.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Algorithms 1 and 2 describe the method logic and Appendix B gives prompts, but there are no step-by-step instructions for reproducing the experiments from scratch, and no code is released.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results (accuracy, cost) are reported as single point estimates with no confidence intervals, standard deviations, or error bars across repeated runs.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are performed for any comparative claim; improvements are reported as raw accuracy differences without any hypothesis testing.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are reported in concrete terms: exact accuracy rates, cost multipliers (2.5×, 2×, 3.5×), and USD savings figures give sufficient context to interpret the magnitude of improvements.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The evaluation uses 134 test episodes (ALFWorld) and 168 test episodes (AppWorld) as determined by the benchmark splits; no justification for whether these sizes are adequate to detect the reported differences is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "All results are single-run point estimates; no variance, standard deviation, or repeated-run statistics are reported for any condition.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The paper includes teacher (upper bound), zero-shot student (lower bound), cascade-only, random mix at multiple ratios, and GPT-4.1 as baselines for comparison.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines use Claude Sonnet 4.5, GPT-4.1-mini, GPT-4.1, and Llama-3.3-70B — models that are current as of the paper's October 2025 pricing reference date.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The four core configurations (ZS, IC alone, Cascade alone, IC+Cascade) constitute a structured ablation isolating the contribution of in-context distillation and self-consistency separately.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Results are reported on both task accuracy (success rate) and inference cost (USD, normalized); Appendix C also reports token counts by type.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "The benchmarks use automated evaluation (unit tests for AppWorld, goal completion for ALFWorld); human evaluation is not applicable.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "ALFWorld uses the eval-out-of-distribution split; AppWorld uses the test-normal split; demonstrations are drawn from separate training splits (T_demo ≠ T_test).",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 2 provides per-difficulty-level breakdowns for AppWorld (levels 1–3); Figures 3 and 4 show per-k and per-database-size breakdowns.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The paper discusses that difficulty-3 AppWorld tasks show significant degradation (43% vs teacher's 71%) and notes fine-tuning produced 0% success on AppWorld despite training loss convergence.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Fine-tuning Llama-3.3-70B on AppWorld teacher demonstrations converged on training loss but achieved 0% task success, and the paper reports this openly as evidence of fine-tuning complexity.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model names are given: Claude Sonnet 4.5 (teacher), GPT-4.1-mini (student), GPT-4.1, Llama-3.3-70B; API pricing as of October 2025 is cited, providing temporal anchoring.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Appendix B provides the full system and user prompts for Plan, ReAct, and Verifier roles, including template variables.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Key hyperparameters are reported: temperature=0.1, k=6 (ALFWorld) and k=3 (AppWorld) retrieved examples, N=3 self-consistency samples, max 4096 output tokens.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The ReAct-style agent loop is described formally in Algorithm 1, the cascade logic in Algorithm 2, and the retrieval mechanism (multi-key, MiniLM-L6-v2 embeddings, cosine similarity) is specified in Sections 4.2–4.3.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The demonstration database construction is described: teacher trajectories are embedded with MiniLM-L6-v2, indexed for cosine-similarity retrieval; goal, plan, and per-step reasoning are embedded separately.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No raw experimental data (episode logs, token counts, per-episode results) is released; only aggregate statistics appear in the paper.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The demonstration collection phase is described: 500 teacher trajectories from ALFWorld train split, 147 from AppWorld train/val splits, executed using the teacher LLM.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants — standard benchmarks with automated evaluation.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline from teacher execution → trajectory storage → embedding/indexing → retrieval → student inference → consistency check is described via Algorithms 1–2 and Sections 4.2–4.4.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training cutoffs for Claude Sonnet 4.5 and GPT-4.1-mini are not stated; both ALFWorld and AppWorld are publicly available benchmarks that could be in model training data.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Potential overlap between public benchmarks and model training data is never discussed, despite using standard public benchmarks with closed-weight models.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Neither ALFWorld (2020) nor AppWorld (2024) contamination relative to model training data is addressed; the paper does not discuss whether models may have seen these tasks during pre-training.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Inference costs are reported in detail: per-episode USD costs, normalized costs, API pricing as of October 2025, and full token breakdowns in Appendix C Tables 4–5.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Per-episode costs and demonstration collection costs are given, but the total compute budget for all experiments (including all ablations across k values, database sizes, multiple benchmarks) is not stated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "In-context distillation with self-consistency cascades achieves 2.5× cost reduction at teacher-level accuracy on ALFWorld (96% vs teacher's 89%).",
    375       "evidence": "Table 1 shows Student (IC+Cascade) at 0.96 accuracy, 0.41 relative cost vs teacher 0.89/1.0; Figure 2 shows the Pareto-frontier shift.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "The method achieves 2× cost reduction at iso-accuracy on AppWorld, recovering 79% of teacher performance.",
    380       "evidence": "Table 1: Student (IC+Cascade) 0.66 accuracy at 0.29 relative cost vs teacher 0.82/1.0; Figure 2 shows the shift in the Pareto frontier.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "In-context distillation alone (without cascading) dramatically bridges the teacher-student gap without any parameter updates.",
    385       "evidence": "Student (IC) on ALFWorld: 0.87 accuracy at 0.43 cost, versus zero-shot 0.18; on AppWorld: 0.55 vs 0.28 zero-shot.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Per-step dynamic retrieval achieves equivalent accuracy to single-trajectory retrieval at substantially lower cost.",
    390       "evidence": "Table 3: both achieve 0.87 accuracy on ALFWorld, but per-step retrieval costs 0.43 vs single-retrieval's 0.54 of teacher cost (a ~20% reduction; equivalently, single-retrieval costs ~26% more).",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "As few as 100 teacher demonstrations yield 94% of teacher accuracy on ALFWorld, demonstrating strong sample efficiency.",
    395       "evidence": "Figure 4: 100-demonstration database yields 0.836 accuracy vs teacher's 0.89; 500 demonstrations close gap to 98%.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "The method generalizes to open-weight models; Llama-3.3-70B benefits similarly from in-context distillation and cascades.",
    400       "evidence": "Table 1: Llama-3.3-70B (IC+Cascade) achieves 0.93 on ALFWorld (vs 0.50 zero-shot) and 0.44 on AppWorld (vs 0.11 zero-shot).",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "Demonstration collection costs amortize after 843 episodes on ALFWorld, yielding $34,900 savings at 1M episode scale.",
    405       "evidence": "Appendix C.5: $29.50 demo cost / $0.035 per-episode savings = 843 breakeven; arithmetic is internally consistent given stated API prices.",
    406       "supported": "strong"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval"
    411   ],
    412   "key_findings": "In-context distillation — retrieving teacher LLM reasoning traces as dynamic in-context examples at each agent step — dramatically bridges the teacher-student accuracy gap without any parameter updates, lifting a zero-shot student from 18% to 87% on ALFWorld and from 28% to 55% on AppWorld. Combining this with self-consistency cascades (defer to teacher only when N=3 student samples disagree) achieves 2.5× cost reduction at teacher-level accuracy on ALFWorld and 2× reduction at 79% teacher accuracy on AppWorld. Per-step retrieval of short relevant trajectory windows is more cost-effective than static full-trajectory retrieval, achieving the same accuracy at roughly 20% lower cost (0.43 vs 0.54 of teacher cost). The approach generalizes across closed-weight (GPT-4.1-mini) and open-weight (Llama-3.3-70B) students and requires minimal upfront cost, amortizing on ALFWorld after only 843 episodes.",
    413   "red_flags": [
    414     {
    415       "flag": "No variance / single runs",
    416       "detail": "All accuracy and cost results are single-run point estimates; no repeated runs, confidence intervals, or standard deviations are reported, making it impossible to assess whether differences are reliable."
    417     },
    418     {
    419       "flag": "Benchmark contamination unaddressed",
    420       "detail": "Both ALFWorld (2020) and AppWorld (2024) are public benchmarks; training data cutoffs for Claude Sonnet 4.5 and GPT-4.1-mini are not stated and possible overlap with model training data is never discussed."
    421     },
    422     {
    423       "flag": "Exceeds-teacher accuracy unexplained",
    424       "detail": "Student IC+Cascade achieves 96% vs teacher's 89% on ALFWorld; the paper attributes this to retrieved examples providing cross-trajectory environment knowledge, but no ablation tests this explanation and the difference lacks significance testing."
    425     },
    426     {
    427       "flag": "No code released",
    428       "detail": "Despite the method being described as simple and training-free, no implementation code is released, making independent reproduction difficult."
    429     },
    430     {
    431       "flag": "Funder–author overlap",
    432       "detail": "Reve is both a funder and the affiliation of two co-authors (Hong, Gharbi); the method's commercial value aligns directly with Reve's business interests in cost-efficient agentic deployment."
    433     },
    434     {
    435       "flag": "Two-benchmark generalization",
    436       "detail": "Abstract and conclusion make broad claims about making 'advanced agentic systems economically viable,' but evidence is from only two benchmarks that were selected for having structural patterns — a favorable condition not representative of all agent tasks."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "ALFWorld: Aligning Text and Embodied Environments for Interactive Learning",
    442       "relevance": "Primary evaluation benchmark for embodied multi-step reasoning; provides the standard out-of-distribution test split used to measure cost-accuracy tradeoffs."
    443     },
    444     {
    445       "title": "AppWorld: A Controllable World of Apps and People for Benchmarking Interactive Coding Agents",
    446       "relevance": "Primary evaluation benchmark for multi-step API workflow automation; provides difficulty-stratified tasks used to analyze method performance at varying complexity."
    447     },
    448     {
    449       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    450       "relevance": "Foundational agent architecture that the paper builds upon for all agent experiments."
    451     },
    452     {
    453       "title": "Self-consistency improves chain of thought reasoning in language models",
    454       "relevance": "Core technique adapted for cascade deferral: agreement among multiple samples signals confidence, disagreement triggers teacher fallback."
    455     },
    456     {
    457       "title": "FrugalGPT: How to Use Large Language Models while Reducing Cost and Improving Performance",
    458       "relevance": "Key prior work on LLM cost reduction via cascades and routing; provides direct baseline for comparing cost-accuracy tradeoff approaches."
    459     },
    460     {
    461       "title": "Large language model cascades with mixture of thoughts",
    462       "relevance": "Direct precedent for self-consistency-based cascade routing in LLM settings; the paper adapts this approach to multi-step agent contexts."
    463     },
    464     {
    465       "title": "Self-generated in-context examples improve LLM agents for sequential decision-making tasks",
    466       "relevance": "Closely related work on retrieval-augmented agent improvement; key distinction is this paper uses teacher-generated rather than self-generated experience."
    467     },
    468     {
    469       "title": "ExPeL: LLM Agents are Experiential Learners",
    470       "relevance": "Related approach to agent improvement via experience retrieval; compared against as a baseline for agent self-improvement methods."
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 3,
    476       "justification": "Directly addresses LLM agent deployment cost with a training-free method requiring only API access, immediately applicable to any practitioner running agents at scale."
    477     },
    478     "surprise_contrarian": {
    479       "score": 2,
    480       "justification": "Reframing distillation as a retrieval/in-context operation rather than weight updates is conceptually novel, and the finding that a student can exceed teacher accuracy is counterintuitive."
    481     },
    482     "fear_safety": {
    483       "score": 0,
    484       "justification": "No AI safety or risk concerns are raised; the paper focuses purely on cost efficiency."
    485     },
    486     "drama_conflict": {
    487       "score": 0,
    488       "justification": "No controversy or competing claims; the paper is a straightforward methods contribution."
    489     },
    490     "demo_ability": {
    491       "score": 2,
    492       "justification": "The method can be implemented with standard API access and public benchmarks, though no code is released to enable immediate replication."
    493     },
    494     "brand_recognition": {
    495       "score": 1,
    496       "justification": "Stanford affiliation but not a major AI lab paper; Reve is not widely recognized."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [
    501       {
    502         "hn_id": "45854862",
    503         "title": "Making Democracy Work: Fixing and Simplifying Egalitarian Paxos",
    504         "points": 180,
    505         "comments": 56,
    506         "url": "https://news.ycombinator.com/item?id=45854862",
    507         "created_at": "2025-11-08T07:29:35Z"
    508       },
    509       {
    510         "hn_id": "46289799",
    511         "title": "Beaver: An Efficient Deterministic LLM Verifier",
    512         "points": 1,
    513         "comments": 1,
    514         "url": "https://news.ycombinator.com/item?id=46289799",
    515         "created_at": "2025-12-16T15:33:02Z"
    516       },
    517       {
    518         "hn_id": "39125875",
    519         "title": "Are Vision Transformers More Data Hungry Than Newborn Visual Systems?",
    520         "points": 1,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=39125875",
    523         "created_at": "2024-01-25T03:21:20Z"
    524       }
    525     ],
    526     "top_points": 180,
    527     "total_points": 182,
    528     "total_comments": 57
    529   }
    530 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs