scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26651B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Importance-Aware Data Selection for Efficient LLM Instruction Tuning",
      6     "authors": [
      7       "Tingyu Jiang",
      8       "Shen Li",
      9       "Yiyao Song",
     10       "Lan Zhang",
     11       "Hualei Zhu"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2511.07074",
     16     "doi": "10.48550/arXiv.2511.07074"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract's core claim that top 1% MIWV-selected data outperforms full-dataset training is directly supported by Table 1 and Figures 2–4, showing win rates consistently above 1.0 across multiple model and dataset combinations.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper makes causal claims that MIWV-selected data improves performance, and these are supported by ablation studies (Section 4.6) directly comparing MIWV vs. random, high-prompt-loss, and low-MIWV strategies on identical architectures.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper claims MIWV is a 'universal data selection method applicable to all LLMs' and 'can be applied to all LLMs,' but experiments cover only LLaMA-7B, LLaMA2-7B/13B, and Qwen2.5-7B/14B — a narrow slice of open-weight models with no closed-model or non-decoder testing.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider that high-MIWV samples may simply correlate with data complexity or diversity under alternative metrics; no alternative explanation for why ICL loss discrepancy identifies valuable data is discussed.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims MIWV 'enhances the model's capabilities' but measures GPT-4 pairwise win rates and benchmark scores; the distinction between GPT-4 judge preference and actual instruction-following capability is not discussed, and AlpacaEval is run on only 5% of its dataset 'due to budget constraints.'",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section anywhere in the paper; the conclusion only presents positive framing.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed: GPT-4 as judge for data generated by GPT-like systems, potential circularity, lack of error bars, or domain specificity are never raised.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper asserts universal applicability without specifying conditions under which MIWV would not apply (e.g., datasets where ICL is unreliable, very long instruction samples, or non-English settings).",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgment states: 'This work was supported by JST CREST Grant Number JPMJCR21M2, including the AIP Challenge Program.'",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed on the title page: Alibaba Cloud Computing, Independent Researcher, and University of Tokyo.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The funder is JST CREST, a Japanese government agency, which is independent of the outcome; however, the majority of authors are Alibaba Cloud employees with commercial interest in efficient fine-tuning — this institutional conflict is undisclosed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "MIWV is formally defined (Equation 8), and the paper defines prompt loss (Eq. 7) and loss without one-shot context (Eq. 5); 'instruction tuning' and 'in-context learning' are used with adequate contextual explanation for the field.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly lists three numbered contributions at the end of the Introduction: the universal data selection method, the MIWV metric, and experimental validation of superiority.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2.2 reviews eight specific competing methods (InstructMining, INSTAG, Alpagasus, QDIT, Deita, RECOST, SelectIT, DiverseEvol) and Section 4.5 directly benchmarks against them, explaining why each is weaker.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository, GitHub link, or promise of public release is mentioned anywhere in the paper.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All training datasets used (Alpaca, WizardLM) and evaluation benchmarks (Vicuna, Koala, LIMA, Self-instruct, Open LLM Leaderboard datasets) are existing publicly available resources used unmodified.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The framework and hardware are stated (PyTorch 2.0.1, A100 80GB, Xeon 8369B) and basic training parameters are in Appendix A, but no requirements.txt, Dockerfile, or complete dependency list is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper references the 'Alpaca codebase' and lists hyperparameters in Appendix A, but provides no step-by-step instructions for running the MIWV selection pipeline or reproducing results from scratch.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Experiments are repeated three times and arithmetic means are reported, but no confidence intervals, error bars, or standard deviations appear in any result table or figure.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any of the comparative win rate claims; numerical differences are presented as evidence without p-values or confidence bounds.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Win rates (e.g., 1.127 for 1% vs. 1.000 baseline) and absolute benchmark score differences (e.g., ARC 57.25 vs. 54.35) provide interpretable relative magnitude of effects.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The number of test examples and training subsets are stated but not justified; no power analysis or discussion of whether the test sets are large enough to detect the observed differences is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "The paper states 'all experiments are repeated three times with arithmetic mean results reported' but never reports standard deviation, variance, or spread across the three runs.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The full-dataset-trained model serves as the primary baseline, and Table 2 compares against eight competing methods: IFD Score, SelectIT, Superfiltering, Alpagasus, Deita, DiverseEvol, Nuggets, and RECOST.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All competing methods are from 2023–2024 (e.g., Superfiltering ACL 2024, SelectIT 2024, RECOST 2024), and MIWV outperforms all of them on win rate.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 4.6 conducts ablations on data selection strategy (random, high prompt loss, low MIWV vs. MIWV) and embedding model choice (bge-en-large, multilingual-e5-large, gte-base-en-v1.5).",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Three distinct evaluation frameworks are used: GPT-4 pairwise win rate, Huggingface Open LLM Leaderboard (ARC, HellaSwag, MMLU, TruthfulQA), and AlpacaEval.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation is conducted; GPT-4 is used as an automated judge as a substitute for human raters, with no validation of judge reliability reported.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Five distinct test datasets (Vicuna, Koala, WizardLM test set, Self-instruct, LIMA) totaling 1,030 instruction samples are held out from training data.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 1 breaks down Open LLM Leaderboard results by individual benchmark (ARC, HellaSwag, MMLU, TruthfulQA); Figures 2–13 break results down by individual test set.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Section 4.8 presents only a success case study showing the 1% model answering a math question correctly; no failure cases or conditions where MIWV underperforms are examined.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The ablation (Figure 3a) explicitly shows that random selection, high-prompt-loss selection, and low-MIWV selection all produce models that underperform the full-dataset baseline.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "LLaMA-7B, LLaMA2-7B, LLaMA2-13B, Qwen2.5-7B, and Qwen2.5-14B are specific named versions; while checkpoint dates are absent, these are widely-known standard releases with sufficient specificity for reproducibility.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Appendix F (Table 5) provides the full system prompt and user prompt template used for GPT-4 evaluation; instruction tuning uses the standard Alpaca prompt format (referenced).",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Appendix A reports: Adam optimizer, learning rate 2×10⁻⁵, batch size 128, 3 epochs, and maximum input lengths (512/1024/2048) per model and dataset combination.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "This is a supervised fine-tuning paper with no agentic scaffolding; the question is not applicable.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Appendix A documents filtering of 'AI censure' samples from WizardLM and the embedding-based one-shot retrieval pipeline using bge-en-large with mean pooling and cosine similarity is fully described in Section 3.1.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Alpaca and WizardLM datasets are publicly available standard resources; benchmark test sets (Vicuna, Koala, LIMA, etc.) are also publicly available.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Appendix A describes dataset construction: Alpaca uses self-instruction via text-DaVinci-003 (52,002 samples); WizardLM uses Evol-Instruct (63,655 samples); both are well-documented in their original papers.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; only standard public benchmark datasets are used.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Sections 3.1–3.3 document the full pipeline: embedding computation → cosine-similarity one-shot retrieval → MIWV loss computation → MIWV ranking → subset selection → instruction tuning.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The pretraining data cutoffs for LLaMA, LLaMA2, and Qwen2.5 are never stated; the paper does not discuss what data these base models were trained on.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether test sets (Vicuna, Koala, LIMA, etc.) may have been in the pretraining data of LLaMA or LLaMA2; contamination is entirely unaddressed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Open LLM Leaderboard benchmarks (ARC, HellaSwag, MMLU, TruthfulQA) were publicly available long before LLaMA/LLaMA2 pretraining cutoffs; no contamination assessment is performed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants involved.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants involved.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants involved.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants involved.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants involved.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants involved.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants involved.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Table 2 reports that MIWV selection takes 85 minutes compared to 8–300 minutes for competing methods, making it the second-fastest method evaluated.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Hardware specifications are given (A100 80GB GPUs) but total GPU-hours for training, or the computational budget for running MIWV across the full datasets, are not reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Selecting the top 1% of data by MIWV produces a model that outperforms one trained on the full dataset across multiple evaluation benchmarks.",
    375       "evidence": "Table 1 shows pairwise win rates of 1.063–1.127 for LLaMA2-7B/13B on Alpaca 1% vs. 100%; consistent improvement also on Open LLM Leaderboard average scores.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "MIWV is a universal metric applicable to all LLMs and both Alpaca-style and WizardLM-style instruction datasets.",
    380       "evidence": "Tested on LLaMA-7B, LLaMA2-7B/13B, and Qwen2.5-7B/14B on two datasets; results are consistently positive but the model family coverage is narrow and all are open-weight decoder-only models.",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "MIWV outperforms eight competing data selection methods in win rate on the WizardLM test set.",
    385       "evidence": "Table 2 shows MIWV achieving the highest win rates at 1%, 5%, 10%, and 15% data fractions compared to IFD Score, SelectIT, Superfiltering, Alpagasus, Deita, DiverseEvol, Nuggets, and RECOST.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Higher MIWV samples exhibit more uniform distribution across the instruction space, correlating with data diversity.",
    390       "evidence": "t-SNE visualization (Figure 6) shows top-5% MIWV samples spread across the embedding space while bottom-5% cluster; this is a qualitative, informal argument not supported by a quantitative diversity metric.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "ICL-guided IFD score selection outperforms the original cluster-based IFD Score method.",
    395       "evidence": "Table 3 shows ICL+IFD achieves overall win rate of 1.017 vs. IFD Score's 0.939 on five test sets using GPT-4 judgment on the 1% subset.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Increasing training data proportion beyond an optimal point degrades win rate due to noise and data interference.",
    400       "evidence": "Figures 2–4 show win rates declining as data proportion increases beyond 10–20%, interpreted as evidence of noise; however, the models still consistently outperform the 100% baseline, and no noise analysis is performed.",
    401       "supported": "weak"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "observational"
    407   ],
    408   "key_findings": "The paper proposes Model Instruction Weakness Value (MIWV), computed as the difference in model loss with vs. without a one-shot ICL example, to identify instruction samples that expose model weaknesses and are therefore most valuable for fine-tuning. Training LLaMA2-7B/13B on only the top 1% of MIWV-ranked Alpaca data (520 samples) consistently outperforms training on the full 52,002-sample dataset across GPT-4 pairwise evaluation and Open LLM Leaderboard benchmarks. MIWV achieves the best win rates among the nine data selection methods compared (MIWV and eight competitors) while requiring only 85 minutes for selection — second-fastest after Superfiltering. Results generalize to the WizardLM dataset and Qwen2.5 model families, supporting (though not proving) the method's broader applicability.",
    409   "red_flags": [
    410     {
    411       "flag": "GPT-4 judge circularity",
    412       "detail": "Alpaca and WizardLM datasets are generated by GPT-like APIs, yet GPT-4 is used as the primary quality judge for pairwise comparisons. This creates potential circularity: models fine-tuned to mimic GPT-style outputs will be favored by a GPT-4 judge independent of actual capability improvement."
    413     },
    414     {
    415       "flag": "No variance across repeated runs",
    416       "detail": "Experiments are run three times and means reported, but no standard deviations are provided; statistical significance of win rate differences is never established, making it impossible to assess whether observed advantages are reliable."
    417     },
    418     {
    419       "flag": "Universality overclaim",
    420       "detail": "The method is claimed to be 'applicable to all LLMs' and 'universal' but is tested only on five open-weight decoder-only models from two families (LLaMA/LLaMA2 and Qwen2.5); no instruction-tuned baselines, non-English models, or closed models are tested."
    421     },
    422     {
    423       "flag": "AlpacaEval on 5% subsample",
    424       "detail": "AlpacaEval is run on only 5% of the dataset 'due to budget constraints,' severely limiting statistical reliability of this metric and making comparisons unreliable."
    425     },
    426     {
    427       "flag": "No limitations section",
    428       "detail": "The paper contains no limitations or threats-to-validity section; conditions under which MIWV fails, edge cases in the one-shot retrieval (irrelevant nearest neighbors), or domain specificity are never discussed."
    429     },
    430     {
    431       "flag": "Benchmark contamination unaddressed",
    432       "detail": "Open LLM Leaderboard benchmarks (ARC, HellaSwag, MMLU, TruthfulQA) predate LLaMA/LLaMA2 pretraining; no contamination analysis is performed despite these benchmarks being primary evaluation metrics."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "From Quantity to Quality: Boosting LLM Performance with Self-Guided Data Selection for Instruction Tuning (IFD Score)",
    438       "relevance": "Primary competing method; MIWV is directly compared against and outperforms IFD Score in ablation and main comparison experiments"
    439     },
    440     {
    441       "title": "LIMA: Less Is More for Alignment",
    442       "relevance": "Closely related finding that small high-quality datasets suffice for instruction tuning; provides theoretical grounding for the paper's central premise"
    443     },
    444     {
    445       "title": "What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning (DEITA)",
    446       "relevance": "Competing method that controls both quality and diversity; directly benchmarked against MIWV in Table 2"
    447     },
    448     {
    449       "title": "Superfiltering: Weak-to-Strong Data Filtering for Fast Instruction-Tuning",
    450       "relevance": "Competing method that achieves similar efficiency to MIWV; the only method faster than MIWV in selection time"
    451     },
    452     {
    453       "title": "SelectIT: Selective Instruction Tuning for Large Language Models via Uncertainty-Aware Self-Reflection",
    454       "relevance": "Competing method requiring model training for selection; used in Table 2 comparison"
    455     },
    456     {
    457       "title": "RECOST: External Knowledge Guided Data-Efficient Instruction Tuning",
    458       "relevance": "Competing method using conditional entropy and external knowledge; directly compared in Table 2"
    459     },
    460     {
    461       "title": "AlpaGasus: Training a Better Alpaca with Fewer Data",
    462       "relevance": "Early work showing that selecting for data quality beats quantity in instruction tuning; uses ChatGPT for filtering, a key limitation when compared with MIWV"
    463     },
    464     {
    465       "title": "Stanford Alpaca: An Instruction-Following LLaMA Model",
    466       "relevance": "Primary training dataset used throughout the paper; provides the base instruction set for MIWV evaluation"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 3,
    472       "justification": "The method claims 99% training cost reduction (1% data) with improved performance — immediately actionable for any practitioner doing LLM fine-tuning with compute constraints."
    473     },
    474     "surprise_contrarian": {
    475       "score": 2,
    476       "justification": "The finding that 1% of data beats 100% is counterintuitive and challenges the conventional 'more data is better' assumption in fine-tuning practice."
    477     },
    478     "fear_safety": {
    479       "score": 0,
    480       "justification": "The paper raises no AI safety concerns; it is purely a training efficiency paper."
    481     },
    482     "drama_conflict": {
    483       "score": 0,
    484       "justification": "No controversy, retraction risk, or interpersonal conflict angle is present."
    485     },
    486     "demo_ability": {
    487       "score": 2,
    488       "justification": "The concept is straightforward to demonstrate with LLaMA and public datasets, though no code is released to enable immediate replication."
    489     },
    490     "brand_recognition": {
    491       "score": 1,
    492       "justification": "Alibaba Cloud and University of Tokyo are known institutions but not top-tier AI labs; the paper lacks association with a flagship product or widely-known research group."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [
    497       {
    498         "hn_id": "10641304",
    499         "title": "SceneNet: Understanding Real World Scenes with Synthetic Data",
    500         "points": 5,
    501         "comments": 1,
    502         "url": "https://news.ycombinator.com/item?id=10641304",
    503         "created_at": "2015-11-28T15:35:43Z"
    504       }
    505     ],
    506     "top_points": 5,
    507     "total_points": 5,
    508     "total_comments": 1
    509   }
    510 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs