scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27190B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Detecting Proxy Gaming in RL and LLM Alignment via Evaluator Stress Tests",
      6     "authors": [
      7       "Ibne Farabi Shihab",
      8       "Sanjeda Akter",
      9       "Anuj Sharma"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2507.05619",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims of 78.4%/81.7% precision/recall (RL), 74.2%/78.6% (LLM), 8.3-point win-rate improvement, and 54.6% hacking reduction are all supported by detailed experimental tables (Tables 1, 7, 9, 11, and 31).",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claims about mitigation effectiveness are supported by ablation studies and control experiments (Table 30) showing that extra compute alone yields only +2.1% vs. +8.3% for detector-triggered intervention.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Limitations section explicitly bounds generalization to '4 tasks and 2 model sizes' and acknowledges that 'real-world deployment would face additional challenges,' appropriately scoping the claims.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Control experiments (Table 30) rule out extra compute and filtering as explanations for win-rate gains; Appendix N discusses false positive patterns including beneficial exploration misclassified as hacking.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper is explicitly about this distinction — judge scores (proxy) vs. human preferences (true objective) — and carefully measures both throughout; Table 2 shows divergence between judge score and human rating on case studies.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 8 'Limitations' is a dedicated section, not a sentence in the conclusion, discussing scope constraints on tasks, model sizes, and deployment conditions.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Limitations specifically name '4 tasks and 2 model sizes,' fixed judges assumption, concept drift, multi-stakeholder objective conflicts, and adversarial adaptation over longer horizons — not generic disclaimers.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states 'mitigation results represent controlled experimental conditions' and that 'larger-scale validation across more diverse domains and model architectures would strengthen generalizability claims.'",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding acknowledgment appears anywhere in the paper; the Acknowledgments section only mentions AI writing tools, not any funding source.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors' affiliations with Iowa State University departments (CS and Civil/Construction/Environmental Engineering) are disclosed on the title page.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No external funder is disclosed, so this criterion is not applicable.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement or financial disclosure appears in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms including 'proxy gaming,' 'exploitable sensitivity,' 'content sensitivity,' and the formal G(y) statistic are precisely defined with mathematical notation in Section 3.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper clearly states its contribution as a unified framework (EST) for detecting proxy gaming in both RL and LLM alignment, with validated benchmarks for both domains.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 substantively engages with RLHF, DPO, reward hacking detection, and LLM-as-judge evaluation literature, explicitly positioning EST's contributions relative to prior approaches' scalability and principled-framework limitations.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No code repository URL is provided. The paper claims benchmark data release but does not mention releasing the detection framework implementation code.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The paper claims 'We release benchmarks for both domains' but provides no URL, DOI, or repository location where the 2,156 RL episodes or 1,200 LLM instances can be accessed.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Only the GPU model (NVIDIA A6000 48GB VRAM) is mentioned; no requirements.txt, Dockerfile, or dependency specifications are provided.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are provided; the paper describes the methodology but not how to replicate the experiments end-to-end.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Tables 7 and 11 report values with ± notation; Table 11 explicitly states '95% CI across 5-fold cross-validation' (e.g., Precision: 0.784±0.027).",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Table 12 reports p-values for all factorial design effects (e.g., Objective Alignment p < 0.001); inter-rater reliability uses Cohen's κ and Fleiss' κ throughout.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Table 12 reports Cohen's d for all experimental factors (e.g., Objective Alignment: Cohen's d = 2.08); the paper explicitly notes these are unusually large due to custom environment design.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Sample sizes (2,156 RL episodes, 1,200 LLM instances) are described but not formally justified with power analysis or sample size calculations.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Tables 7, 11, 18, and 29 consistently report ± spread values (standard deviations, or 95% CIs across 5-fold cross-validation in Table 11) alongside mean performance metrics.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Extensive baseline comparisons in Tables 3, 16, and 29 include length-only, format features, KL regularization, judge ensembling, LSTM-Autoencoder, One-Class SVM, Isolation Forest, reward model ensemble disagreement, and probe-based detection.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Baselines include contemporary methods like reward model ensemble disagreement (F1 0.687), probe-based detection, hardened judges, and established ML anomaly detection methods — not outdated or trivially weak.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Ablation studies in Tables 17 and 29 systematically remove individual detection components (EST, correlation tracking, reasoning validity, format perturbation, content perturbation) measuring each contribution.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Results reported across precision, recall, F1, AUC-ROC, early warning latency (checkpoints), computational overhead %, human win-rate, and judge-human correlation.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Human evaluation is central: 1,200 human-annotated LLM gaming instances with 3 raters achieving Fleiss' κ ≥ 0.78, and 2,156 expert-annotated RL episodes with Cohen's κ = 0.847.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Section 4.1 states 'strict train-validation-test splits, holding out entire task-model-judge combinations for testing'; RL uses environment-stratified splits with 5-fold cross-validation.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Table 11 provides per-category RL detection performance (6 hacking categories); Table 1 provides per-task, per-model-size, and per-judge breakdowns for all 32 LLM conditions.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Appendix N provides dedicated error and boundary case analysis, manually examining 100 classification errors (50 false positives, 50 false negatives) with representative qualitative examples.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Adaptive evasion tests (Table 24) show precision dropping from 74.2% to 65.9% under white-box attacks; zero-shot cross-environment transfer shows 10-15 F1 point degradation (Table 32).",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "GPT-4 is used as a judge without specifying version (e.g., gpt-4-0314, gpt-4-turbo) or snapshot date; Llama-3-8B/70B are named but no model card versions are pinned.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "No actual prompts or system instructions are provided for LLM fine-tuning, judge evaluation, or the EST perturbation generation steps.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Key hyperparameters are reported: detection threshold τ=0.6, τspec=0.3, correlation threshold ∆ρ=0.5, contamination γ=0.1, window size W=50, format penalty 20%, mitigation threshold α=0.1.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The online detection pipeline is described in detail including Algorithm 1 for detector-triggered mitigation, the 6-detector ensemble structure with complexity analysis (Appendix D), and per-checkpoint monitoring protocol (Table 14).",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Transformation validity thresholds (cosine similarity >0.85, NLI entailment >0.7), number of perturbations (5 per type), token-count control (±5%), and semantic validity audit procedures are documented with specific thresholds.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "The paper claims to release benchmarks but provides no URL or repository link where the raw data can be accessed for independent verification.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Data collection is described: 3 human annotators with consensus ≥2/3 for LLM instances; RL expert annotation with κ = 0.847 across 15 environments, 10 random seeds, and environment-stratified splits.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": true,
    279           "answer": false,
    280           "justification": "Annotators are mentioned ('3 human annotators,' '3 human raters') but no recruitment procedure, qualification criteria, or platform is described; the ethics statement mentions only informed consent and compensation.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The pipeline from LLM fine-tuning checkpoints → output sampling → perturbation generation → validity audit → detection scoring is described with algorithmic detail in Section 3 and Appendix D.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Training cutoffs for Llama-3-8B, Llama-3-70B, and GPT-4 are not stated; this matters since tasks include TL;DR summarization (Reddit data) that could overlap with pretraining corpora.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of potential overlap between TL;DR summarization or other evaluation task data and the pretraining data of Llama-3 or GPT-4.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "LLM tasks include TL;DR summarization (Reddit-based) which could be present in Llama-3's pretraining data, affecting gaming behavior; this potential contamination is not addressed.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human subjects experiment requiring pre-registration; annotation workers are used for labeling, not as research participants in an experimental study.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "Human annotation is conducted (with informed consent and compensation noted in ethics statement), but this is annotation labor rather than a human subjects experiment requiring IRB review.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human research participants; annotation workers are service providers and their demographics are not applicable to the study.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human subjects study with inclusion/exclusion criteria for research participants.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human subjects randomized experiment.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human subjects blinded experiment.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participant attrition applicable.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Computational overhead is reported as 2.1% for LLM and 4.2% for RL; Table 31 provides absolute GPU-hours for each mitigation technique (e.g., 0.47 GPU-hrs for combined approach on NVIDIA A6000).",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Per-technique overhead is reported but the total compute budget for the full experimental suite (32 LLM conditions × fine-tuning runs, 15 RL environments × 10 seeds) is not stated.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "EST achieves 78.4% precision and 81.7% recall for RL reward hacking detection across 15 environments and 5 algorithms",
    373       "evidence": "Table 11 reports these figures on 2,156 expert-annotated episodes with 5-fold cross-validation and 95% confidence intervals (0.784±0.027, 0.817±0.023); Cohen's κ = 0.847 inter-rater agreement",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "EST achieves 74.2% precision and 78.6% recall for LLM evaluator gaming detection across 32 experimental conditions",
    378       "evidence": "Tables 1 and 7 report these figures on 1,200 human-annotated instances with Fleiss' κ ≥ 0.78 inter-rater agreement across 4 tasks",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Closed-loop EST-triggered mitigation improves human win-rate by 8.3 points (52.1% → 60.4%) for LLM fine-tuning",
    383       "evidence": "Table 9 shows win-rate improvement; Table 30 control experiments show extra compute alone yields only +2.1%, ruling out compute as confound",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Closed-loop mitigation reduces RL reward hacking by 54.6% with 9.1% performance impact",
    388       "evidence": "Table 31 reports combined approach achieves 54.6% hacking reduction; Pareto frontier analysis in Figure 10 shows this is the best available trade-off",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Proxy-true correlation tracking transfers directly between RL and LLM domains without modification",
    393       "evidence": "Table 8 defines direct transfer as ≥90% in-domain performance; correlation tracking achieves AUC 0.821 (RL) and 0.798 (LLM) without modification",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "EST provides early warning with median lead time of 3 checkpoints before human-noticeable quality decline",
    398       "evidence": "Figure 2 and Table 7 report 3.0±0.4 checkpoint lead time, defined as checkpoints between detection trigger and human win-rate dropping below 0.50",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "EST outperforms all baselines including reward model ensemble disagreement (F1 0.734 vs 0.687)",
    403       "evidence": "Tables 3 and 16 compare against 9+ baselines; EST achieves F1 0.734 vs. the closest competitors at 0.700 (hardened judge), 0.694 (standalone correlation tracking), and 0.687 (reward model ensemble disagreement)",
    404       "supported": "strong"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval",
    409     "empirical"
    410   ],
    411   "key_findings": "EST detects proxy gaming through invariance-based stress tests that separate exploitable format sensitivity from content-driven improvements, achieving 78.4%/81.7% precision/recall for RL hacking and 74.2%/78.6% for LLM evaluator gaming across 32 experimental conditions. Closed-loop mitigation triggered by EST improves human win-rates by 8.3 points and reduces RL hacking by 54.6%, with control experiments ruling out compute as the explanation. Cross-domain analysis reveals that correlation tracking and ensemble voting transfer directly between RL and LLM domains while perturbation design requires adaptation. The framework operates online with low overhead (2.1% LLM, 4.2% RL) and provides 3-checkpoint early warning before human-noticeable quality decline, enabling proactive intervention during fine-tuning.",
    412   "red_flags": [
    413     {
    414       "flag": "No code or data URL provided",
    415       "detail": "Despite claiming to release benchmarks (2,156 RL episodes, 1,200 LLM instances), no repository URL, DOI, or access location is provided anywhere in the paper."
    416     },
    417     {
    418       "flag": "GPT-4 version unspecified",
    419       "detail": "GPT-4 is used as a judge evaluator throughout without specifying version (gpt-4-0314, gpt-4-turbo, etc.) or snapshot date, making exact replication impossible."
    420     },
    421     {
    422       "flag": "Circular RL ground truth for most episodes",
    423       "detail": "For 13,091 of 15,247 RL episodes (86%), ground truth is established via detector consensus (3+ of 6 detectors agree) rather than human annotation — the authors acknowledge this circularity but use these episodes for pattern analysis and prevalence claims."
    424     },
    425     {
    426       "flag": "Custom environments maximize contrast",
    427       "detail": "The paper acknowledges 'unusually large effect sizes reflect our custom environments designed to maximize experimental contrast'; real-world effect sizes are acknowledged to be smaller (Cohen's d 0.8-1.2 vs. reported 1.24-2.08)."
    428     },
    429     {
    430       "flag": "Self-citation for key supporting claims",
    431       "detail": "Two key supporting citations (Shihab et al., 2025a on entropy regularization and Shihab et al., 2025b on reward function structure) are by the same authors and are arXiv preprints, not peer-reviewed work."
    432     },
    433     {
    434       "flag": "No prompts provided for LLM experiments",
    435       "detail": "The paper does not provide actual prompts used for LLM fine-tuning, judge evaluation rubrics, or the perturbation generation process, significantly limiting reproducibility."
    436     }
    437   ],
    438   "cited_papers": [
    439     {
    440       "title": "Defining and Characterizing Reward Hacking (Skalse et al., 2022)",
    441       "relevance": "Provides the formal definition of proxy gaming used throughout the paper and theoretical grounding for EST's unhackability criterion"
    442     },
    443     {
    444       "title": "Training Language Models to Follow Instructions with Human Feedback (Ouyang et al., 2022)",
    445       "relevance": "Foundational RLHF work that introduces the training paradigm EST is designed to monitor for gaming"
    446     },
    447     {
    448       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model (Rafailov et al., 2023)",
    449       "relevance": "One of the two training methods (DPO) evaluated in the LLM experiments"
    450     },
    451     {
    452       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena (Zheng et al., 2023)",
    453       "relevance": "Establishes LLM-as-judge evaluation paradigm that the paper identifies as vulnerable to evaluator gaming"
    454     },
    455     {
    456       "title": "Scaling Laws for Reward Model Overoptimization (Gao et al., 2023)",
    457       "relevance": "Directly relevant to proxy-true divergence measurement; shows reward model scores diverge from human preferences with overoptimization"
    458     },
    459     {
    460       "title": "Specification Gaming: The Flip Side of AI Ingenuity (Krakovna et al., 2020)",
    461       "relevance": "Provides taxonomy of reward hacking behaviors that grounds the RL component of EST and defines specification gaming"
    462     },
    463     {
    464       "title": "Concrete Problems in AI Safety (Amodei et al., 2016)",
    465       "relevance": "Foundational safety paper establishing reward hacking as a core alignment challenge that motivates the EST framework"
    466     },
    467     {
    468       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models (Wei et al., 2022)",
    469       "relevance": "Related to the reasoning validity detector component of EST that checks for valid reasoning chains during fine-tuning"
    470     }
    471   ],
    472   "engagement_factors": {
    473     "practical_relevance": {
    474       "score": 3,
    475       "justification": "Directly applicable to anyone training LLMs with RLHF or LLM-as-judge evaluation pipelines — an increasingly important and widespread concern in AI development"
    476     },
    477     "surprise_contrarian": {
    478       "score": 1,
    479       "justification": "The existence of proxy gaming is well-established; the invariance-based unified framework across domains is novel but not surprising in its conclusions"
    480     },
    481     "fear_safety": {
    482       "score": 2,
    483       "justification": "Addresses AI alignment safety concerns about reward hacking and evaluator gaming, with concrete detection and mitigation methods for recognized risks in deployed RLHF systems"
    484     },
    485     "drama_conflict": {
    486       "score": 1,
    487       "justification": "No major controversy; the paper confirms known problems and proposes systematic solutions rather than challenging dominant narratives"
    488     },
    489     "demo_ability": {
    490       "score": 1,
    491       "justification": "Claims benchmarks are released but provides no URL; cannot easily reproduce or demo the framework without code release"
    492     },
    493     "brand_recognition": {
    494       "score": 0,
    495       "justification": "Iowa State University authors without prior high-profile publications; no famous lab or product association"
    496     }
    497   },
    498   "hn_data": {
    499     "threads": [
    500       {
    501         "hn_id": "38720557",
    502         "title": "ReLoRA: High-Rank Training Through Low-Rank Updates",
    503         "points": 3,
    504         "comments": 0,
    505         "url": "https://news.ycombinator.com/item?id=38720557"
    506       },
    507       {
    508         "hn_id": "41035192",
    509         "title": "The Limitations of Compute Thresholds as a Governance Strategy",
    510         "points": 1,
    511         "comments": 0,
    512         "url": "https://news.ycombinator.com/item?id=41035192"
    513       }
    514     ],
    515     "top_points": 3,
    516     "total_points": 4,
    517     "total_comments": 0
    518   }
    519 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs