scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26019B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Grading Scale Impact on LLM-as-a-Judge: Human-LLM Alignment Is Highest on 0-5 Grading Scale",
      6     "authors": [
      7       "Weiyue Li",
      8       "Minda Zhao",
      9       "Weixuan Dong",
     10       "Jiahui Cai",
     11       "Yuze Wei",
     12       "Michael Pocress",
     13       "Yi Li",
     14       "Wanyan Yuan",
     15       "Xiaoyue Wang",
     16       "Ruoyu Hou",
     17       "Kaiyuan Lou",
     18       "Wenqi Zeng",
     19       "Yutong Yang",
     20       "Yilun Du",
     21       "Mengyu Wang"
     22     ],
     23     "year": 2026,
     24     "venue": "arXiv.org",
     25     "arxiv_id": "2601.03444",
     26     "doi": "10.48550/arXiv.2601.03444"
     27   },
     28   "checklist": {
     29     "claims_and_evidence": {
     30       "abstract_claims_supported": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "All abstract claims—LLM inconsistency across scales on subjective benchmarks, 0-5 maximizing alignment, 0-10 being weakest, pooled reliability masking heterogeneity, and gender subgroup differences—are backed by Tables 1–5.",
     34         "source": "haiku"
     35       },
     36       "causal_claims_justified": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The claim that scale choice 'shifts' human-LLM agreement is supported by a fully-crossed design where the same 12 humans and 6 LLMs rate the same 150 items on all three scales, isolating scale as the only varying factor; a temperature ablation (Table 6) further rules out stochasticity.",
     40         "source": "haiku"
     41       },
     42       "generalization_bounded": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The limitations section explicitly bounds the findings to graduate-student annotators and notes that the best scale can be benchmark-dependent (MT-Bench diverges); the abstract itself states results are 'aggregated over tasks.'",
     46         "source": "haiku"
     47       },
     48       "alternative_explanations_discussed": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper rules out decoding stochasticity via the temperature ablation but does not discuss alternative explanations for why 0-5 specifically outperforms 0-10, such as cognitive load differences, clustering effects, or annotator anchoring strategies.",
     52         "source": "haiku"
     53       },
     54       "proxy_outcome_distinction": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper explicitly defines its outcome measures (ICC for absolute agreement, nMAE for absolute deviation) and motivates why ICC is preferred over Pearson correlation, clearly distinguishing what is measured from what is claimed.",
     58         "source": "haiku"
     59       }
     60     },
     61     "limitations_and_scope": {
     62       "limitations_section_present": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "A dedicated 'Limitations' section appears on page 8, separate from the conclusion, with multiple specific points.",
     66         "source": "haiku"
     67       },
     68       "threats_to_validity_specific": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Specific threats named include: annotator pool limited to graduate students whose educational background and LLM familiarity may skew calibration; underspecified or knowledge-intensive items causing human uncertainty that depresses ICC estimates.",
     72         "source": "haiku"
     73       },
     74       "scope_boundaries_stated": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper explicitly states the conclusion is aggregate and that 'the best scale can be benchmark-dependent (e.g., MT-Bench)', and that results should not be generalized beyond the study's annotator population.",
     78         "source": "haiku"
     79       }
     80     },
     81     "conflicts_of_interest": {
     82       "funding_disclosed": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No funding acknowledgment section or grant disclosure appears anywhere in the paper text.",
     86         "source": "haiku"
     87       },
     88       "affiliations_disclosed": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Author affiliations (Harvard University, CMU, Stanford University, UC San Diego) are stated on the title page.",
     92         "source": "haiku"
     93       },
     94       "funder_independent_of_outcome": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "No funder is disclosed, so independence cannot be assessed.",
     98         "source": "haiku"
     99       },
    100       "financial_interests_declared": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No competing interests statement or financial disclosure appears in the paper.",
    104         "source": "haiku"
    105       }
    106     },
    107     "scope_and_framing": {
    108       "key_terms_defined": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "ICC is defined via two-way random-effects ANOVA with explicit formulas (Equations 1–3); nMAE is formally defined (Equations 4–5); 'human-LLM alignment' is operationalized as ICC(A,1) on a 2-column matrix of human-consensus vs. LLM-ensemble scores.",
    112         "source": "haiku"
    113       },
    114       "intended_contribution_clear": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The introduction explicitly states the paper fills a gap by 'investigating inter-scale consistency and its impact on human-LLM agreement' and is 'the first to assess how the choice of score scale affects human-LLM alignment.'",
    118         "source": "haiku"
    119       },
    120       "engagement_with_prior_work": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The related work section positions the paper against Lee et al. (2025)'s Pearson-correlation approach, explains why ICC is superior, and maps contributions onto multiple existing threads (judge bias, consistency, human alignment).",
    124         "source": "haiku"
    125       }
    126     }
    127   },
    128   "type_checklist": {
    129     "empirical": {
    130       "artifacts": {
    131         "code_released": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No code repository or release is mentioned anywhere in the paper.",
    135           "source": "haiku"
    136         },
    137         "data_released": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The six benchmarks used are public, but the primary data contribution—12 annotators' human ratings across all scales—is not stated to be released.",
    141           "source": "haiku"
    142         },
    143         "environment_specified": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "No requirements file, Docker image, or software environment specification is provided.",
    147           "source": "haiku"
    148         },
    149         "reproduction_instructions": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No step-by-step reproduction instructions are included; the appendices provide prompts and annotation instructions but not a runnable pipeline.",
    153           "source": "haiku"
    154         }
    155       },
    156       "statistical_methodology": {
    157         "confidence_intervals_or_error_bars": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "All ICC and nMAE values in Tables 1–6 are point estimates with no confidence intervals or bootstrapped error ranges reported.",
    161           "source": "haiku"
    162         },
    163         "significance_tests": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "No statistical significance tests are applied to scale comparisons (e.g., no test that 0-5 ICC=0.853 is significantly higher than 0-10 ICC=0.805).",
    167           "source": "haiku"
    168         },
    169         "effect_sizes_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "ICC values are reported numerically (e.g., 0-5: 0.853 vs 0-10: 0.805 in Table 2), providing interpretable magnitude of differences in a standardized metric.",
    173           "source": "haiku"
    174         },
    175         "sample_size_justified": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "No justification is given for 12 annotators or 150 items; no power analysis is presented for the ICC comparisons.",
    179           "source": "haiku"
    180         },
    181         "variance_reported": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "Per-benchmark breakdowns show variability across benchmarks but no within-cell variance or confidence bounds are reported for any ICC estimate.",
    185           "source": "haiku"
    186         }
    187       },
    188       "evaluation_design": {
    189         "baselines_included": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "All three scales (0-5, 0-10, 0-100) serve as mutual comparators; all six models are compared against each other and against the human panel.",
    193           "source": "haiku"
    194         },
    195         "baselines_contemporary": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "LLM judges include GPT-4o, Gemini-2.5-flash, Llama-3.3-70B, Qwen3-32B, DeepSeek-v3.2, and Mistral-7B—all contemporary models.",
    199           "source": "haiku"
    200         },
    201         "ablation_study": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Section 5.3 presents an explicit temperature ablation (T∈{0.1,0.4,0.7,1.0}) for Llama and Gemini, showing scale ordering is stable across decoding strategies.",
    205           "source": "haiku"
    206         },
    207         "multiple_metrics": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Both ICC (absolute agreement) and nMAE (mean absolute deviation normalized by scale range) are reported throughout Tables 2–5.",
    211           "source": "haiku"
    212         },
    213         "human_evaluation": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "12 graduate student annotators rated all 150 items on all three scales in a fully-crossed design, with separate sessions per scale spaced at least one week apart.",
    217           "source": "haiku"
    218         },
    219         "held_out_test_set": {
    220           "applies": false,
    221           "answer": false,
    222           "justification": "This is a reliability/agreement study, not a prediction task; held-out test sets are not applicable.",
    223           "source": "haiku"
    224         },
    225         "per_category_breakdown": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Results are broken down per benchmark (Tables 3, 4a), per model (Table 4b), and per gender subgroup (Table 5).",
    229           "source": "haiku"
    230         },
    231         "failure_cases_discussed": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Section 5.4 and Appendix G (Tables 16–17) present representative poorly-aligned and well-aligned cases with explanations of why disagreements occur.",
    235           "source": "haiku"
    236         },
    237         "negative_results_reported": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "0-10 is reported as consistently weakest; LLM judges perform poorly on MT-Bench and SummEval; MT-Bench shows scale-dependent behavior that does not uniformly favor 0-5.",
    241           "source": "haiku"
    242         }
    243       },
    244       "setup_transparency": {
    245         "model_versions_specified": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Exact model versions are stated: gpt-4o, gemini-2.5-flash (Comanici et al. 2025), Llama-3.3-70B-Instruct, Qwen3-32B, DeepSeek-v3.2, Mistral-7B-Instruct-v0.3.",
    249           "source": "haiku"
    250         },
    251         "prompts_provided": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Full prompts for all six benchmarks are provided in Appendices C–D (Tables 8–13), including both system and user messages with all placeholders identified.",
    255           "source": "haiku"
    256         },
    257         "hyperparameters_reported": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Temperature=1 and non-thinking mode are stated for all LLM judges; fractional scoring is allowed for both humans and LLMs.",
    261           "source": "haiku"
    262         },
    263         "scaffolding_described": {
    264           "applies": false,
    265           "answer": false,
    266           "justification": "No agentic scaffolding is used; LLMs are called directly with single prompts.",
    267           "source": "haiku"
    268         },
    269         "data_preprocessing_documented": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Sampling procedure (25 items per benchmark, 150 total), randomization of item and scale order, and session spacing (≥1 week between scale blocks) are documented.",
    273           "source": "haiku"
    274         }
    275       },
    276       "data_integrity": {
    277         "raw_data_available": {
    278           "applies": true,
    279           "answer": false,
    280           "justification": "Neither the human annotation scores nor the LLM-generated scores are stated to be publicly released.",
    281           "source": "haiku"
    282         },
    283         "data_collection_described": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Data collection is described in detail: Label Studio platform, voluntary recruitment, informed consent, randomized item order, separate sessions per scale spaced ≥1 week apart.",
    287           "source": "haiku"
    288         },
    289         "recruitment_methods_described": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "Appendix A describes voluntary recruitment from graduate students at multiple institutions, fully informed consent, no identifying information collected, and annotator awareness of potentially sensitive content.",
    293           "source": "haiku"
    294         },
    295         "data_pipeline_documented": {
    296           "applies": true,
    297           "answer": true,
    298           "justification": "The pipeline from raw scores to ICC computation via two-way random-effects ANOVA is described with equations; the LLM querying pipeline (standardized templates, single numeric output) is described, though no code is released.",
    299           "source": "haiku"
    300         }
    301       },
    302       "contamination": {
    303         "training_cutoff_stated": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "This study evaluates calibration and agreement properties of LLM judges, not downstream benchmark performance; training cutoff contamination is not applicable.",
    307           "source": "haiku"
    308         },
    309         "train_test_overlap_discussed": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "NA — the study does not evaluate model capabilities on novel benchmarks; the benchmarks are used as stimulus items for rating behavior.",
    313           "source": "haiku"
    314         },
    315         "benchmark_contamination_addressed": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "NA — potential prior exposure to benchmark items is not the focus; the study assesses rating calibration, not knowledge recall.",
    319           "source": "haiku"
    320         }
    321       },
    322       "human_studies": {
    323         "pre_registered": {
    324           "applies": true,
    325           "answer": false,
    326           "justification": "No pre-registration is mentioned anywhere in the paper.",
    327           "source": "haiku"
    328         },
    329         "irb_or_ethics_approval": {
    330           "applies": true,
    331           "answer": false,
    332           "justification": "Appendix A mentions informed consent and voluntary participation but does not mention IRB approval or institutional ethics review.",
    333           "source": "haiku"
    334         },
    335         "demographics_reported": {
    336           "applies": true,
    337           "answer": false,
    338           "justification": "Only gender (6 female, 6 male) and academic status (graduate students) are reported; no age, race, institution, or domain expertise breakdown is provided.",
    339           "source": "haiku"
    340         },
    341         "inclusion_exclusion_criteria": {
    342           "applies": true,
    343           "answer": false,
    344           "justification": "No explicit inclusion or exclusion criteria for annotator selection are stated beyond 'graduate students across multiple institutions.'",
    345           "source": "haiku"
    346         },
    347         "randomization_described": {
    348           "applies": true,
    349           "answer": true,
    350           "justification": "Item order within each scale block is shuffled, and scale block order is randomized across annotators to reduce anchoring and fatigue effects.",
    351           "source": "haiku"
    352         },
    353         "blinding_described": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "No blinding of annotators to scale conditions or study hypotheses is mentioned.",
    357           "source": "haiku"
    358         },
    359         "attrition_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "All 12 annotators appear to have completed the study, but dropout rates and any exclusions are not explicitly reported.",
    363           "source": "haiku"
    364         }
    365       },
    366       "cost_and_practicality": {
    367         "inference_cost_reported": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "No inference cost, API call counts, or latency measurements are reported for any of the six LLM judges.",
    371           "source": "haiku"
    372         },
    373         "compute_budget_stated": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "Total computational budget (GPU hours, API cost) is not stated anywhere in the paper.",
    377           "source": "haiku"
    378         }
    379       }
    380     }
    381   },
    382   "claims": [
    383     {
    384       "claim": "The 0-5 grading scale yields the highest human-LLM alignment aggregated over six benchmarks (ICC=0.853, nMAE=0.111), while 0-10 is consistently the weakest (ICC=0.805, nMAE=0.122).",
    385       "evidence": "Table 2 reports pooled human-LLM ICC and nMAE across all three scales; the ordering is consistent across both metrics.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "LLM inter-scale agreement degrades substantially on subjective, open-ended benchmarks (MT-Bench avg ICC ~0.740, SummEval ~0.786) but remains high on objective-like benchmarks (STS-B ~0.944, ToxiGen ~0.949).",
    390       "evidence": "Table 1 reports per-benchmark, per-model inter-scale ICC across all three scales and pairwise comparisons.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Pooled LLM panel reliability appears excellent (~0.944–0.950) but is dominated by objective benchmarks, masking much lower reliability on subjective tasks (SummEval LLM ICC=0.573, MT-Bench=0.632 at 0-5 scale).",
    395       "evidence": "Table 3 shows per-benchmark internal reliability on the 0-5 scale with a delta column (ICCHuman − ICCLLM) reaching 0.357 for SummEval.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Scale-dependent calibration differences dominate over decoding stochasticity: the 0-5 > 0-100 > 0-10 ordering is stable across temperatures T∈{0.1,0.4,0.7,1.0}.",
    400       "evidence": "Table 6 reports per-model per-scale human-LLM ICC at four temperatures for Llama and Gemini.",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "Female raters align more with LLMs on the 0-10 scale whereas male raters align more on 0-5 and 0-100.",
    405       "evidence": "Table 5 shows ICCFemale-LLM=0.805 on 0-10 vs ICCMale-LLM=0.751; ICCMale-LLM=0.839 on 0-5 vs ICCFemale-LLM=0.831.",
    406       "supported": "moderate"
    407     },
    408     {
    409       "claim": "GPT achieves the strongest pooled human alignment across all scales (0-5: 0.816), followed by Gemini (0.782); Mistral is weakest (0.596).",
    410       "evidence": "Table 4b reports model-wise human-LLM ICC at each scale; GPT leads on all three scales.",
    411       "supported": "strong"
    412     }
    413   ],
    414   "methodology_tags": [
    415     "benchmark-eval",
    416     "observational"
    417   ],
    418   "key_findings": "Grading scale is a consequential but underexplored parameter in LLM-as-a-judge protocols: the 0-5 scale achieves the highest absolute human-LLM agreement (ICC=0.853) while 0-10 is consistently weakest, even when within-group panel reliability is near-perfect on all scales. LLM inter-scale consistency degrades substantially on subjective open-ended benchmarks (MT-Bench, SummEval) but remains high on objective-like benchmarks, creating a 'reliability illusion' where pooled metrics conceal benchmark-specific failures. Gender subgroup analysis reveals that female and male annotators differ in which scale best aligns with LLM judgments, underscoring that scale effects are not uniform across annotator subpopulations. These findings motivate treating scale design as a controllable, diagnostically important component of LLM evaluation protocols.",
    419   "red_flags": [
    420     {
    421       "flag": "Underpowered gender subgroup analysis",
    422       "detail": "Gender comparisons are drawn from 6 raters per group; ICC estimates from 6 raters are highly unstable and no confidence intervals or significance tests accompany the reported differences."
    423     },
    424     {
    425       "flag": "No statistical significance tests on scale comparisons",
    426       "detail": "The central claim that 0-5 > 0-100 > 0-10 is never tested for significance; ICC differences of ~0.05 between scales could be within sampling noise given 150 items and 12 raters."
    427     },
    428     {
    429       "flag": "No confidence intervals on ICC estimates",
    430       "detail": "All ICC values are point estimates; ICC CIs can be wide with small rater pools and are standard in psychometric practice (Koo & Li 2016, which this paper cites, explicitly recommends reporting them)."
    431     },
    432     {
    433       "flag": "Human annotations not released",
    434       "detail": "The primary empirical contribution—human ratings across all scales and benchmarks—is not made available, preventing independent replication."
    435     },
    436     {
    437       "flag": "No IRB disclosure",
    438       "detail": "The study involves human participants rating potentially offensive content (ToxiGen, MoralChoice) but does not report IRB approval."
    439     },
    440     {
    441       "flag": "Graduate student convenience sample",
    442       "detail": "All 12 annotators are graduate students with LLM familiarity; this pool likely differs systematically from crowd workers or domain experts who are the actual target population for LLM-as-a-judge deployment."
    443     }
    444   ],
    445   "cited_papers": [
    446     {
    447       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    448       "relevance": "Introduces MT-Bench and the LLM-as-a-judge paradigm; core benchmark and methodology used directly in this work."
    449     },
    450     {
    451       "title": "Evaluating the Consistency of LLM Evaluators",
    452       "relevance": "Prior work on LLM evaluator consistency under prompt perturbation; directly compared against—uses Pearson correlation where this paper argues for ICC."
    453     },
    454     {
    455       "title": "A Guideline of Selecting and Reporting Intraclass Correlation Coefficients for Reliability Research",
    456       "relevance": "Methodological foundation for ICC use; the paper's primary statistical tool is ICC(A,1) and ICC(A,k) as specified here."
    457     },
    458     {
    459       "title": "G-Eval: NLG Evaluation Using GPT-4 with Better Human Alignment",
    460       "relevance": "Representative LLM-as-a-judge paper that this work positions against; uses fixed scales without inter-scale analysis."
    461     },
    462     {
    463       "title": "Validating LLM-as-a-Judge Systems Under Rating Indeterminacy",
    464       "relevance": "Concurrent work on LLM judge reliability under discrete Likert scales; cited as motivation for examining scale effects."
    465     },
    466     {
    467       "title": "Are We on the Right Way to Assessing LLM-as-a-Judge?",
    468       "relevance": "Shows ~25% inconsistency in pairwise LLM judge scoring; motivates rigorous measurement of judge reliability."
    469     },
    470     {
    471       "title": "Intraclass Correlations: Uses in Assessing Rater Reliability",
    472       "relevance": "Original ICC formulation (Shrout & Fleiss 1979); foundational statistical reference for the paper's methodology."
    473     },
    474     {
    475       "title": "LLM Evaluators Recognize and Favor Their Own Generations",
    476       "relevance": "Documents self-preference bias in LLM judges; part of the systematic bias landscape this work situates itself within."
    477     },
    478     {
    479       "title": "SummEval: Re-Evaluating Summarization Evaluation",
    480       "relevance": "One of the six core benchmarks used; the subjective benchmark where human-LLM agreement is weakest in this study."
    481     },
    482     {
    483       "title": "ChatEval: Towards Better LLM-Based Evaluators Through Multi-Agent Debate",
    484       "relevance": "Multi-agent judge framework alternative to single-judge scoring; cited as a competing approach this work complements."
    485     }
    486   ],
    487   "engagement_factors": {
    488     "practical_relevance": {
    489       "score": 3,
    490       "justification": "Directly actionable: anyone running LLM-as-a-judge evaluations should use a 0-5 scale rather than the common 0-10 scale based on this finding."
    491     },
    492     "surprise_contrarian": {
    493       "score": 2,
    494       "justification": "The finding that finer-grained 0-10 and 0-100 scales yield worse human alignment than 0-5 is counterintuitive given psychometric intuitions about granularity."
    495     },
    496     "fear_safety": {
    497       "score": 0,
    498       "justification": "No AI safety or risk concerns raised; this is a measurement methodology paper."
    499     },
    500     "drama_conflict": {
    501       "score": 1,
    502       "justification": "Challenges the tacit convention of using 0-10 scales in prominent benchmarks like MT-Bench, but framed constructively rather than confrontationally."
    503     },
    504     "demo_ability": {
    505       "score": 2,
    506       "justification": "Practitioners could immediately reproduce the comparison by running any LLM on public benchmarks with the provided prompts across all three scales."
    507     },
    508     "brand_recognition": {
    509       "score": 1,
    510       "justification": "Authors from Harvard, CMU, Stanford, and UCSD lend some institutional credibility, but no famous lab product or named system is involved."
    511     }
    512   },
    513   "hn_data": {
    514     "threads": [],
    515     "top_points": 0,
    516     "total_points": 0,
    517     "total_comments": 0
    518   }
    519 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs