scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30261B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Is In-Context Learning Learning?",
      6     "authors": [
      7       "Adrian de Wynter"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2509.10414",
     12     "doi": "10.48550/arXiv.2509.10414"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "All three core abstract claims — ICL fits PAC learning definition mathematically, accuracy is insensitive to model/prompt/distribution in the limit, and CoT/APO show distributional sensitivity — are directly supported by Tables 1–2 and ablation results.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper uses controlled synthetic data generation with explicit distributional shift parameters (δ), five prompting strategies, and ablations on lexical features, positionality, and distribution to isolate causal mechanisms; this design supports causal inference about ICL's limitations.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper explicitly bounds its claims: 'constrained to non-natural language tasks,' 'constrained to easily-verifiable tasks in a single call,' and 'Our results are limited to the ability of ICL to draw conclusions from the data's features alone.' Appendix D elaborates these scope boundaries.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 7 explicitly discusses two alternative explanations — contamination and tokenization — and argues why they cannot fully account for the observed results (e.g., good performance on Hamiltonian and PARITY suggests FSA simulation, not memorization).",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Section 6.4 explicitly distinguishes 'compliance' (parsing errors counted as failures) from 'learning' (correctly labeled instances), showing this distinction changes slopes and magnitude of performance estimates and matters for interpretation.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Appendix A is explicitly titled 'LIMITATIONS' and discusses reproducibility challenges from LLM updates, prohibitive cost of full-suite re-runs, exclusion of reasoning models and multi-call paradigms, and nuances of ML baseline interpretation.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Specific threats are named: 'LLMs are continuously updated' (reproducibility), 'running synchronously a single task per LLM could and has taken months' (replication cost), and 'interpreting the results from the ML baselines is nuanced' due to input representation length sensitivity.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Explicit scope exclusions include: reasoning models like o3-mini, multi-step/multi-call strategies such as Tree-of-Thoughts, natural-language datapoints (only instructions are in natural language), and tasks with unbounded states beyond the vending machine sum.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding source is disclosed anywhere in the paper; there is no acknowledgments or funding section, despite the author's Microsoft affiliation implying institutional support.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Author affiliation is clearly stated in the header: 'Microsoft and the University of York.'",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The sole author is a Microsoft employee, and three of the four evaluated LLMs are Microsoft or Microsoft-affiliated models (GPT-4 Turbo and GPT-4o from OpenAI, in which Microsoft is a major investor, and Microsoft's own Phi-3.5 MoE Instruct); this conflict is not disclosed or addressed.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement, patent disclosure, or financial interests declaration appears in the paper.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Core terms are formally defined: 'learning' via PAC framework (Section 3.1), 'ICL' via formal probability equations (Section 3.3), 'OOD' via distribution distance parameter δ, and 'prompting strategies' (n-Shot, Description, APO, DE, CoT, Word Salad) with explicit descriptions.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper explicitly states it provides: (1) a theoretical framing of ICL within PAC learning, and (2) 'one of the largest' large-scale empirical analyses ablating memorization, pretraining, distributional shift, and prompting style across 4 LLMs and 9 tasks.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 2 is a substantive related-work section that situates the paper relative to theoretical limitations work (Hahn & Rofin 2024, Strobl et al. 2024), empirical ICL studies (Sclar et al. 2024, Agarwal et al. 2024), and explicitly identifies gaps this work addresses.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": true,
    120           "justification": "Code is available at https://github.com/adewynter/is-icl-learning, referenced repeatedly throughout the paper including in the Reproducibility Statement: 'All code is included in the repository. It will be open-sourced under the MIT licence.'",
    121           "source": "haiku"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "All datasets are synthetic and generated by code released in the repository; the generation procedure (automata, transition probabilities, δ parameterization) is fully described and reproducible from the released code.",
    127           "source": "haiku"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Hardware is described (Azure Standard ND40rs v2, 8× NVIDIA V100 32GB), and scikit-learn is named for baselines, but no requirements.txt, Dockerfile, or dependency specifications are provided.",
    133           "source": "haiku"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "The paper refers readers to the repository and provides model parameters in Appendix F, but no step-by-step reproduction instructions are given; Appendix A explicitly warns against re-running the full suite due to cost.",
    139           "source": "haiku"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Standard deviation (σ) is reported throughout alongside mean accuracy in all tables (e.g., '94±1%', '80±3', Table 1) and for OLS slopes.",
    147           "source": "haiku"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No formal statistical significance tests (t-tests, ANOVA, etc.) are applied to comparative claims; OLS is used only to measure slopes/trends, not to test significance of differences between conditions.",
    153           "source": "haiku"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Accuracy differences are reported with magnitude (e.g., '31% accuracy gap' between closely-related tasks), and OLS slopes (e.g., CoT OOD slope -1.4 vs modus ponens -0.4) provide effect-size-like quantification.",
    159           "source": "haiku"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "Test set size of 1,000 (out of 2,000) is explained by cost constraints ('due to cost we only evaluated 1000'), not by formal power analysis or statistical justification.",
    165           "source": "haiku"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "Standard deviation is consistently reported in all results tables alongside every accuracy figure.",
    171           "source": "haiku"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Decision trees, k-nearest neighbours, and MLPs are tested as baselines; the best-performing is reported against LLM performance for each task.",
    179           "source": "haiku"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "The LLMs tested (GPT-4 Turbo, GPT-4o, Mixtral 8x7B, Phi-3.5 MoE) are contemporary models; the traditional ML baselines are intentionally classical to test whether LLMs provide learning advantages over them.",
    185           "source": "haiku"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Four substantive ablation studies are conducted: (1) lexical features / word salad, (2) exemplar positionality, (3) alternate training distributions, and (4) compliance vs. learning separation.",
    191           "source": "haiku"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": false,
    196           "justification": "Accuracy is the sole evaluation metric throughout; OLS slopes and σ are statistical treatments of accuracy, not independent metrics measuring different aspects of performance.",
    197           "source": "haiku"
    198         },
    199         "human_evaluation": {
    200           "applies": false,
    201           "answer": false,
    202           "justification": "The paper evaluates LLMs on formal language/synthetic tasks with ground-truth labels; human evaluation is clearly irrelevant.",
    203           "source": "haiku"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Five separate test sets are generated per task for each δ ∈ {0, 0.2, 0.45, 0.65, 0.85}; training data is only used for APO prompt optimization and ML baselines.",
    209           "source": "haiku"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Results are extensively broken down per task (9 tasks in Table 1), per LLM (4 models), per prompting strategy (7 strategies in Table 2), and per OOD level (5 δ values).",
    215           "source": "haiku"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Vending Machine (Sum) is analyzed as a complete failure case (near-zero slope regardless of shots); SoT's near-zero accuracy is discussed; and OOD brittleness particularly in CoT is analyzed in depth.",
    221           "source": "haiku"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "The paper's primary contribution is its negative results: ICL fails to generalize robustly to OOD, CoT is brittle, and traditional ML baselines outperform average ICL accuracy in half of evaluated tasks.",
    227           "source": "haiku"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Appendix F, Table 4 specifies versions: GPT-4-0125 for GPT-4 Turbo, GPT-4o, Phi-3.5-MoE-Instruct, and Mixtral-8x7B instruct v01; temperature=0 and max tokens are also specified.",
    235           "source": "haiku"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Appendix H provides fully worked-out sample prompts in six prompting styles (modus ponens, description, word salad, CoT, DE, SoT) with actual content for five tasks: PARITY, Pattern Matching, Reversal, Maze Complete, and Vending Machine.",
    241           "source": "haiku"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Temperature=0, max_tokens=3 (standard) / 1024 (CoT) / 512 (APO) are reported; APO parameters (batch size 1024, beam width 4, search depth 6) are also specified.",
    247           "source": "haiku"
    248         },
    249         "scaffolding_described": {
    250           "applies": false,
    251           "answer": false,
    252           "justification": "All evaluations are single-call LLM invocations; there is no agentic scaffolding.",
    253           "source": "haiku"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Section 4.5 documents the full data generation pipeline: automata design, transition probabilities, five δ levels, 2000 training / 1000 test entries per condition, η=0.05 mislabeling for memorization control, and deduplication.",
    259           "source": "haiku"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "The GitHub repository contains both code and data; since data is synthetic, the released generation code serves as the authoritative source for raw data.",
    267           "source": "haiku"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Section 4.5 and Appendix E describe the full synthetic data generation procedure per task, including automaton structure, alphabet, transition probabilities, and OOD characterization.",
    273           "source": "haiku"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "No human participants; data is entirely synthetic.",
    279           "source": "haiku"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "The full pipeline from automaton specification to test set generation (including deduplication, mislabeling, balancing) is documented in Section 4.5 and Appendix E, with code in the repository.",
    285           "source": "haiku"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "Model versions are given (GPT-4-0125, Phi-3.5-MoE-Instruct, Mixtral-8x7B instruct v01) but training data cutoffs are not stated for any model.",
    293           "source": "haiku"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": true,
    298           "justification": "Section 7 explicitly discusses contamination as an alternative explanation for high accuracy on certain tasks (Pattern Matching), and the η=0.05 mislabeling scheme is introduced specifically to 'account for any potential memorisation.'",
    299           "source": "haiku"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": true,
    304           "justification": "The paper uses artificial alphabets (e.g., Reversal uses symbols like '¯\\_(ツ)_/¯') specifically to force learning from context rather than memorized patterns, and explicitly tests a randomly initialized model as contamination-free baseline (Appendix D.2).",
    305           "source": "haiku"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human participants.",
    313           "source": "haiku"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants.",
    319           "source": "haiku"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants.",
    325           "source": "haiku"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants.",
    331           "source": "haiku"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants.",
    337           "source": "haiku"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants.",
    343           "source": "haiku"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants.",
    349           "source": "haiku"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "Inference cost is described only qualitatively ('could and has taken months,' 'aggregate cost could render further exploration prohibitive') with no dollar figures or token counts provided.",
    357           "source": "haiku"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "Hardware (Azure Standard ND40rs v2, 8× V100 32GB) is mentioned but total compute hours or budget is not stated; the paper explicitly discourages others from re-running the full suite.",
    363           "source": "haiku"
    364         }
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "ICL mathematically fits the PAC learning framework because next-token prediction can satisfy PAC learning's generalization requirements.",
    371       "evidence": "Formal derivation in Section 3.3 reframes ICL as a PAC learner f: {p} × X → {0,1} satisfying Equations 3–5.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "In the limit (50–100 exemplars), accuracy gaps between LLMs and prompting strategies narrow and all prompts show positive improvement slopes.",
    376       "evidence": "Table 2 OLS slopes: all prompting strategies have positive shot slopes (modus ponens 8.3, description 4.4, CoT 3.3); Table 1 shows peak averages at 50–100 shots.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "ICL is brittle to out-of-distribution inputs, especially chain-of-thought and APO prompting.",
    381       "evidence": "Table 2 δ slopes: CoT has the largest negative OOD slope (-1.4), APO -0.5; Figure 2 shows converging OOD brittleness particularly for CoT.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Semantically nonsensical prompts (word salad) reach near-equivalent accuracy to semantic prompts in the limit.",
    386       "evidence": "Table 6 shows word salad matching non-salad best-of performance within σ for PARITY, Pattern Matching, Maze Complete, and Vending Machine Verification; average slope 11±4.6 vs description 4.4±2.2.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Closely related formal tasks have accuracy gaps of up to 31%, indicating ICL generalizes per task, not per task class.",
    391       "evidence": "Table 1: Pattern Matching achieves 94±1% while Maze Solve achieves 63±5% — both FSA tasks. Reversal (61%) vs Stack (73%) differ by 12% despite both being PDA tasks.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Traditional ML baselines (DT, kNN, MLP) outperform ICL on average best performance in half of evaluated tasks.",
    396       "evidence": "Table 1 footnotes: kNN outperforms LLM averages in Reversal, Stack, and Pattern Matching on average (not best-of); LLMs only outperform in best-of scenarios.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "ICL learns the observed distribution P rather than generalizing to unseen Q — fully randomizing exemplars decreases accuracy.",
    401       "evidence": "Section 6.2 reports fully randomized exemplars yield lower average accuracy (43%) vs shuffled and unshuffled (48% each) for GPT-4o; discussed in Section 7 as evidence of distributional overfitting.",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "benchmark-eval",
    407     "theoretical",
    408     "observational"
    409   ],
    410   "key_findings": "While ICL formally satisfies the PAC learning definition (any next-token prediction scheme can be cast as a PAC learner), it is empirically a weak learning paradigm: accuracy converges across models and prompting styles only at 50–100 exemplars (far more than commonly claimed), and even semantically nonsensical prompts reach near-equivalent performance in the limit, suggesting LLMs exploit statistical regularities in the prompt rather than task semantics. ICL is robust to in-distribution exemplar ordering and label proportions but brittle to out-of-distribution inputs, with CoT and APO showing the largest OOD degradation. Formally similar tasks (both FSA-recognizable) show accuracy differences up to 31%, indicating that ICL's generalizability is task-specific and that autoregressive encoding is not a robust cross-task learning mechanism.",
    411   "red_flags": [
    412     {
    413       "flag": "Undisclosed conflict of interest",
    414       "detail": "The sole author is a Microsoft employee; three of four evaluated LLMs are Microsoft or Microsoft-affiliated models (GPT-4 Turbo and GPT-4o from OpenAI, in which Microsoft is a major investor, and Microsoft's own Phi-3.5 MoE Instruct). This potential bias is not disclosed or discussed."
    415     },
    416     {
    417       "flag": "Single metric only",
    418       "detail": "Accuracy is the sole evaluation metric throughout all 1.89M predictions; no alternative measures (e.g., calibration, F1 for imbalanced conditions) are used, limiting interpretive richness."
    419     },
    420     {
    421       "flag": "No significance testing",
    422       "detail": "OLS slopes and standard deviations are reported, but no formal hypothesis tests (t-tests, ANOVA) are applied to comparative claims between prompting strategies or models."
    423     },
    424     {
    425       "flag": "Identical version strings for distinct models",
    426       "detail": "Table 4 lists both GPT-4 Turbo and GPT-4o as 'Version: GPT-4-0125,' which appears to be a copy-paste error and undermines the model versioning claim."
    427     },
    428     {
    429       "flag": "Synthetic-only data limits applicability",
    430       "detail": "All tasks use artificial alphabets and formal languages explicitly designed to prevent semantic priors; the paper acknowledges but cannot resolve whether findings transfer to natural-language ICL scenarios."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "A Theory of the Learnable (PAC learning)",
    436       "relevance": "Foundational framework the paper uses to formally define and evaluate whether ICL constitutes learning."
    437     },
    438     {
    439       "title": "Language Models are Few-Shot Learners (GPT-3)",
    440       "relevance": "Original paper claiming few-shot ICL capability; this work's findings directly challenge its characterization of ICL generalization."
    441     },
    442     {
    443       "title": "What Languages are Easy to Language-Model? (Borenstein et al. 2024)",
    444       "relevance": "Motivating reference calling for empirical evaluation of LLM effective capabilities, directly cited as core motivation."
    445     },
    446     {
    447       "title": "Quantifying Language Models' Sensitivity to Spurious Features in Prompt Design (Sclar et al. 2024)",
    448       "relevance": "Prior work on prompt sensitivity that this paper's results both confirm (sensitivity) and partially refute (ordering sensitivity disappears in the limit)."
    449     },
    450     {
    451       "title": "Faith and Fate: Limits of Transformers on Compositionality (Dziri et al. 2023)",
    452       "relevance": "Prior empirical work finding LLM weaknesses in compositional generalization; this paper's findings on task-type inconsistency build on these results."
    453     },
    454     {
    455       "title": "Many-Shot In-Context Learning (Agarwal et al. 2024)",
    456       "relevance": "Shows that expanding shots improves performance; independently confirmed and extended by this paper across more tasks, models, and distribution shifts."
    457     },
    458     {
    459       "title": "Neural Networks and the Chomsky Hierarchy (Deletang et al. 2023)",
    460       "relevance": "Directly related work evaluating LLMs on formal language tasks; this paper uses similar task classes (FSA, PDA) with different methodology."
    461     },
    462     {
    463       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models (Wei et al. 2022)",
    464       "relevance": "Foundational CoT paper; this work tests CoT extensively and finds it improves accuracy but is the most brittle to OOD shifts."
    465     },
    466     {
    467       "title": "Awes, Laws, and Flaws from Today's LLM Research (de Wynter 2025)",
    468       "relevance": "Same author's prior methodological critique of LLM research practices that motivates this work's careful ablation design."
    469     },
    470     {
    471       "title": "The Expressive Power of Transformers with Chain of Thought (Merrill & Sabharwal 2024)",
    472       "relevance": "Theoretical work on CoT and TC0 circuits that this paper empirically tests and partially confirms regarding CoT's task-specific effectiveness."
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 2,
    478       "justification": "Directly challenges practitioner assumptions about few-shot sufficiency and CoT reliability, but the synthetic task setup limits immediate deployment relevance."
    479     },
    480     "surprise_contrarian": {
    481       "score": 3,
    482       "justification": "Directly contradicts the widely-held belief that a few exemplars suffice for ICL (finding 50–100 are needed) and that CoT is robustly beneficial (showing it is the most OOD-brittle strategy)."
    483     },
    484     "fear_safety": {
    485       "score": 0,
    486       "justification": "No AI safety or risk concerns raised; the paper focuses on learning theory and evaluation methodology."
    487     },
    488     "drama_conflict": {
    489       "score": 2,
    490       "justification": "Challenges widespread capability claims about LLMs in high-profile venues; the negative finding that 'ICL is not robust learning' is likely to generate debate."
    491     },
    492     "demo_ability": {
    493       "score": 1,
    494       "justification": "Code is released and could be run, but Appendix A explicitly discourages re-running the full suite due to prohibitive cost; partial demos are feasible."
    495     },
    496     "brand_recognition": {
    497       "score": 2,
    498       "justification": "Microsoft affiliation and evaluation of flagship GPT-4 models provide brand recognition, though this is a single-author paper from a researcher, not a major lab announcement."
    499     }
    500   },
    501   "hn_data": {
    502     "threads": [
    503       {
    504         "hn_id": "42807387",
    505         "title": "A Faster Quantum Fourier Transform",
    506         "points": 89,
    507         "comments": 6,
    508         "url": "https://news.ycombinator.com/item?id=42807387",
    509         "created_at": "2025-01-23T19:49:59Z"
    510       },
    511       {
    512         "hn_id": "43496244",
    513         "title": "Parameter-free KV cache compression for memory-efficient long-context LLMs",
    514         "points": 65,
    515         "comments": 19,
    516         "url": "https://news.ycombinator.com/item?id=43496244",
    517         "created_at": "2025-03-27T18:07:41Z"
    518       },
    519       {
    520         "hn_id": "43695562",
    521         "title": "M1: Towards Scalable Test-Time Compute with Mamba Reasoning Models",
    522         "points": 33,
    523         "comments": 3,
    524         "url": "https://news.ycombinator.com/item?id=43695562",
    525         "created_at": "2025-04-15T17:00:18Z"
    526       },
    527       {
    528         "hn_id": "44024987",
    529         "title": "Can You Trust Code Copilots? Evaluating LLMs from a Code Security Perspec",
    530         "points": 11,
    531         "comments": 2,
    532         "url": "https://news.ycombinator.com/item?id=44024987",
    533         "created_at": "2025-05-18T23:09:48Z"
    534       },
    535       {
    536         "hn_id": "43116772",
    537         "title": "AI Alignment at Your Discretion",
    538         "points": 3,
    539         "comments": 0,
    540         "url": "https://news.ycombinator.com/item?id=43116772",
    541         "created_at": "2025-02-20T16:33:53Z"
    542       },
    543       {
    544         "hn_id": "45284415",
    545         "title": "Is In-Context Learning Learning?",
    546         "points": 2,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=45284415",
    549         "created_at": "2025-09-18T02:27:54Z"
    550       },
    551       {
    552         "hn_id": "45350535",
    553         "title": "DeepMind Paper on Virtual Agent Economies",
    554         "points": 2,
    555         "comments": 0,
    556         "url": "https://news.ycombinator.com/item?id=45350535",
    557         "created_at": "2025-09-23T17:54:23Z"
    558       },
    559       {
    560         "hn_id": "45282518",
    561         "title": "Virtual Agent Economies",
    562         "points": 2,
    563         "comments": 0,
    564         "url": "https://news.ycombinator.com/item?id=45282518",
    565         "created_at": "2025-09-17T23:06:36Z"
    566       },
    567       {
    568         "hn_id": "45268153",
    569         "title": "Virtual Agent Economies",
    570         "points": 2,
    571         "comments": 0,
    572         "url": "https://news.ycombinator.com/item?id=45268153",
    573         "created_at": "2025-09-16T21:14:43Z"
    574       },
    575       {
    576         "hn_id": "45250763",
    577         "title": "Advancing Deep Search Agents with Knowledge Graphs and Multi-Turn RL",
    578         "points": 1,
    579         "comments": 0,
    580         "url": "https://news.ycombinator.com/item?id=45250763",
    581         "created_at": "2025-09-15T15:22:49Z"
    582       }
    583     ],
    584     "top_points": 89,
    585     "total_points": 210,
    586     "total_comments": 30
    587   }
    588 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs