scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25389B)
      1 {
      2   "paper": {
      3     "title": "Ask Me Anything: A Simple Strategy for Prompting Language Models",
      4     "authors": [
      5       "Simran Arora",
      6       "Avanika Narayan",
      7       "Mayee F. Chen",
      8       "Laurel Orr",
      9       "Neel Guha",
     10       "Kush Bhatia",
     11       "Ines Chami",
     12       "Frederic Sala",
     13       "Christopher Ré"
     14     ],
     15     "year": 2022,
     16     "venue": "International Conference on Learning Representations",
     17     "arxiv_id": "2210.02441",
     18     "doi": "10.48550/arXiv.2210.02441"
     19   },
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper provides a GitHub link: https://github.com/HazyResearch/ama_prompting. The Reproducibility Statement (Section 7) states: 'We release prompts and code for reproducing all benchmark results for few-shot and AMA prompting, and our diagnostic evaluation splits.'"
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "All 20 evaluation benchmarks are publicly available standard datasets (SuperGLUE, ANLI, AGNews, DBPedia, SST2, Amazon, DROP, NQ, WebQs, RealTimeQA, StoryCloze). The paper also releases diagnostic evaluation splits via the GitHub repository."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions using A100 NVIDIA GPUs (Appendix A) and lists models downloaded from HuggingFace, but does not provide a requirements.txt, Dockerfile, or detailed dependency/library version listing."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The Reproducibility Statement (Section 7) states code is released for 'reproducing all benchmark results.' The appendix (Section H) provides all prompts used for each of the 20 datasets, and Appendix A provides experiment details including model names and metrics."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Figure 5a reports 95% confidence intervals for the AMA lift across model sizes. Results in Tables 1 and 3 include ± notation (e.g., '83.9±0.0', '37.8±0.2') across 5 random seeds. The overall lift is reported as '10.2% ± 6.1% absolute.'"
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper claims AMA 'exceeds the average few-shot performance of the GPT3-175B model on 15 of 20 benchmarks' but provides no statistical significance tests for any comparative claims. Differences are assessed by comparing point estimates only."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper reports effect sizes with baseline context throughout: '10.2% ± 6.1% absolute (21.4% ± 11.2% relative) lift' (Section 5.2), '72% performance improvement' converting prompt formats (Section 3.2), and per-task absolute point differences in Tables 1 and 3."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The diagnostic tasks use only 50 manually-labeled samples per task (Appendix E), with no justification for why 50 is adequate. The number of random seeds (5) and prompt-chains (3-6) are stated but not justified."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Tables 1 and 3 report standard deviations across 5 random seeds for AMA results (e.g., '83.9±0.0', '37.8±0.2'). Figure 5a shows 95% confidence intervals across tasks."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper compares against few-shot baselines (k=3 for open-source models and k∈[32..70] for GPT-3), majority vote aggregation, weighted majority vote (Table 5), Self-Consistency (Table 7), and calibration methods (Zhao et al., 2021)."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include contemporary methods: GPT-3-175B few-shot (Brown et al., 2020), calibration (Zhao et al., 2021), Self-Consistency (Wang et al., 2022b), and T0 with PromptSource prompts (Sanh et al., 2022). These were all recent at time of submission."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Multiple ablation studies are presented: (1) Table 4 ablates prompt reformatting vs. aggregation, (2) Table 5 ablates aggregation methods (MV vs. WS vs. WMV vs. Pick Best vs. AMA no deps), (3) Figure 6 varies amount of unlabeled data, (4) Section 3.2 ablates prompt formats (restrictive vs. cloze vs. QA)."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper uses matching accuracy for most tasks, text F1 for DROP/RealTimeQA, span overlap accuracy for WebQ/NQ, and F1a accuracy for MultiRC (Appendix A.2). The information-theoretic conditional entropy H(y|ŷ) is also used as an evaluation lens (Section 4)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation of AMA's outputs is conducted. All evaluation is automated via benchmark metrics. For a prompting method that claims to improve LLM quality, human evaluation of output quality would be relevant but is absent."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Results are reported on standard test sets for all 20 benchmarks. The weak supervision algorithm uses an unlabeled dataset constructed from the test set plus 1000 training samples with labels ignored (Appendix A.3), maintaining proper separation."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by task category (NLU, NLI, Classification, QA) in Tables 1 and 3, by model family in Section 5.2, by model size in Figure 5a, and per-class F1 in Table 10."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Appendix G provides an extended error analysis with three categories of failure modes: knowledge errors (factual and commonsense), instruction-following errors (invalid output class), and long-context errors. Concrete examples are provided for each."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that AMA 'performs worse than MV by at most 1.0 points' on 4 of 20 tasks (Section 5.3), that the method has 'limitations as seen on NQ' (Section 5.1), and that T0 sees lower lift because fine-tuning 'may compromise its in-context learning abilities' (Section 5.2)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims 'average performance lift of 10.2% over the few-shot baseline' and 'GPT-J-6B model to match and exceed the performance of few-shot GPT3-175B on 15 of 20 popular benchmarks' — both supported by Tables 1 and 3 and Section 5.2."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper makes causal claims through ablation studies (e.g., removing QA reformatting or changing aggregation method) which are controlled single-variable manipulations. Section 3.2 isolates prompt format effects, Table 4 separates reformatting from aggregation, and Table 5 isolates aggregation method effects."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper bounds its claims to tested settings: 'across open-source model families (EleutherAI, BLOOM, OPT, and T0) and sizes (125M-175B parameters)' (abstract). The conclusion notes limitations for 'private data' applications. Section 5.1 distinguishes between context-dependent and knowledge-intensive tasks."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section 3.2 and Appendix F investigate why QA prompts are effective by analyzing pretraining corpus frequencies, offering an empirical alternative explanation. The paper discusses that T0's lower lift may be due to fine-tuning on zero-shot prompts. The error analysis (Appendix G) discusses knowledge limitations as confounds."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Appendix A.1 specifies exact model variants: GPT-Neo-125M, GPT-Neo-1.3B, GPT-J-6B, GPT-NeoX-20B, BLOOM-560M, BLOOM-1.7B, BLOOM-7.1B, BLOOM-176B, OPT-125M, OPT-1.3B, OPT-6.7B, OPT-13B, OPT-175B, T0-3B, and GPT-3 davinci endpoint. These are specific model identifiers."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Appendix H provides the full prompt text for all 20 datasets, including both few-shot and AMA prompt()-chains with actual in-context demonstrations. The prompts are released in the code repository as well."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Temperature and sampling parameters are not reported for the main experiments. For Self-Consistency comparisons (Table 7), temperatures are listed (0.0, 0.3, 0.5, 0.6, 0.7), but the main AMA experiments lack these details. The WS structure learning uses 'default parameters from Varma et al. [2019]' without specifying them."
    154       },
    155       "scaffolding_described": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The two-step prompt()-chain pipeline is described in detail: question() prompts transform inputs to questions (Section 3.3), answer() prompts generate answers, and weak supervision aggregates votes (Section 3.4). Figure 1 provides a visual workflow diagram."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Appendix H documents dataset sizes (train/test splits) for each benchmark. Appendix A.2 documents the metrics used per task. The mapping from open-ended answers to output classes is described in Section 3.2. Appendix A.3 describes the unlabeled dataset construction."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "While there is no dedicated 'Limitations' section, Sections 5.1 and 6 discuss limitations substantively. The error analysis in Appendix G serves as a de facto limitations discussion covering knowledge gaps, instruction-following failures, and long-context issues."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "The paper identifies specific threats: (1) AMA provides less lift on knowledge-intensive tasks where factual recall is needed (Section 5.1), (2) T0 sees lower lift due to fine-tuning compromising in-context learning (Section 5.2), (3) instruction-following failures on multi-class tasks (Appendix G), (4) long-context degradation (Appendix G)."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 5.1 explicitly states: 'we find limitations when tasks cannot rely on the latent knowledge' and gives specific examples (NQ, RealTimeQA, Amazon Instant Video). The conclusion acknowledges applicability constraints for 'private data or require operating over large amounts of data.' The error analysis categorizes what AMA does not handle."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "All 20 benchmark datasets are publicly available. The code repository is released for reproducing results, and diagnostic evaluation splits are also released. Per-task results are reported in tables."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "All datasets used are standard benchmarks with published collection procedures (cited in Appendix H). The diagnostic tasks (Appendix E) are described with their construction methodology and 50 manually-labeled samples per task."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No human participants are involved. All evaluation uses standard NLP benchmark datasets and automated metrics."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The pipeline is documented: input → question() prompt → LLM generates question → answer() prompt → LLM generates answer → map to output space → weak supervision aggregation (Section 3.3, 3.4, Algorithm 1). Appendix A.3 documents WS data construction."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The Acknowledgements section discloses extensive funding: DARPA, NIH, NSF, ONR, Moore Foundation, and numerous industry sponsors. Individual fellowship support is listed for SA and LO."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Author affiliations are clearly listed: Stanford University (most authors), Numbers Station (Ines Chami), and University of Wisconsin-Madison (Frederic Sala). The Acknowledgements also mention Together Computer, Numbers Station, and Snorkel."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Funding comes from government agencies (DARPA, NIH, NSF, ONR) and academic foundations, which do not have a direct stake in AMA outperforming specific prompting methods. While industry sponsors are listed, the paper evaluates open-source models, not products from those sponsors."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests statement is provided. Christopher Ré is associated with Snorkel (mentioned in acknowledgements), which is based on weak supervision — the same technique central to AMA. This potential conflict is not disclosed."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The paper does not state the training data cutoff dates for any of the models used (GPT-J, BLOOM, OPT, T0, GPT-3 davinci), despite evaluating them on public benchmarks that could be in their training data."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No discussion of potential train/test overlap. Many benchmarks used (SuperGLUE, ANLI, etc.) were publicly available before model training and could have been included in training data, but this is not addressed."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "The paper evaluates models on benchmarks like SuperGLUE (2019), ANLI (2020), and others that were publicly available before model training. No contamination analysis is provided. This is especially relevant since the GPT-3 comparison uses 'davinci' which was likely trained on data including these benchmarks."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this study. All evaluation is automated using benchmark datasets."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants are involved."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": true,
    285         "justification": "Table 6 reports total inference cost in seconds for GPT-J-6B on three tasks (RTE: 8310s, WSC: 3141s, AGNews: 53200s). Appendix B.3 reports WS latency (13.0s without dependencies, 84.3s with dependencies)."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "The paper mentions using A100 GPUs (Appendix A) and that computation was provided by Together Computer, but does not report total GPU hours, total API spend for GPT-3 davinci calls, or overall computational budget."
    291       }
    292     }
    293   },
    294   "claims": [
    295     {
    296       "claim": "AMA enables the open-source GPT-J-6B model to match or exceed the performance of few-shot GPT3-175B on 15 of 20 popular benchmarks.",
    297       "evidence": "Table 1 shows per-task comparisons where GPT-J-6B with AMA (QA + WS) surpasses GPT-3-175B few-shot (k∈[32..70]) on 15 tasks. Section 5.1 discusses this result.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "AMA provides an average performance lift of 10.2% ± 6.1% absolute (21.4% ± 11.2% relative) over few-shot baseline across models.",
    302       "evidence": "Section 5.2 reports this figure across 14 LLMs from 4 model families on 7 tasks. Figure 5a shows the lift by model size with 95% confidence intervals.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Open-ended QA prompt formats outperform restrictive prompt formats.",
    307       "evidence": "Section 3.2 shows converting CB, RTE, WSC from restrictive to open-ended formats improves average performance from 41.7% to 71.5% (+72%) on GPT-J-6B. Pretraining corpus analysis (Appendix F) shows QA structures appear ~1000x more frequently.",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "Weak supervision aggregation outperforms majority vote by up to 8.7 points.",
    312       "evidence": "Section 5.3 and Table 5 show WS vs. MV comparisons across 16 tasks. WS does not underperform MV by more than 1.0 point on any task while achieving up to 8.7 points improvement.",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "WS recovers dependencies among prompts to boost performance by up to 9.6 points on 9 tasks.",
    317       "evidence": "Section 5.3 states this, comparing AMA (WS) with AMA (no deps). Table 5 provides per-task numbers showing RTE improves from 65.1 to 75.1 with dependency modeling.",
    318       "supported": "moderate"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "benchmark-eval"
    323   ],
    324   "key_findings": "AMA (Ask Me Anything) is a prompting strategy that reformats task inputs into open-ended QA formats using the LLM itself, then aggregates multiple prompt outputs via weak supervision. The method achieves an average 10.2% absolute lift over few-shot baselines across 14 open-source LLMs from 4 families (125M-175B parameters), measured on 7 tasks; the full evaluation spans 20 benchmarks. Most notably, GPT-J-6B with AMA matches or exceeds few-shot GPT-3-175B on 15 of those 20 benchmarks. The gains are largest on context-dependent tasks (reading comprehension) and smaller on knowledge-intensive tasks (factual recall).",
    325   "red_flags": [
    326     {
    327       "flag": "No contamination analysis",
    328       "detail": "All benchmarks (SuperGLUE, ANLI, etc.) were publicly available before model training. No analysis of potential train/test overlap is provided, which could inflate benchmark numbers for all models tested."
    329     },
    330     {
    331       "flag": "Unfair shot comparison with GPT-3",
    332       "detail": "The GPT-3-175B baseline uses k∈[32..70] shots from Brown et al. (2020), while AMA uses 3-6 prompt()-chains plus unlabeled data from the test set and 1000 training samples. The comparison is between different prompting paradigms with different information access, which is not always clearly flagged."
    333     },
    334     {
    335       "flag": "Snorkel conflict of interest undisclosed",
    336       "detail": "Christopher Ré is a co-founder of Snorkel AI, which commercializes weak supervision — the core aggregation technique in AMA. Snorkel is listed in the acknowledgements but this conflict of interest is not formally disclosed."
    337     },
    338     {
    339       "flag": "No significance tests for comparative claims",
    340       "detail": "Claims that AMA 'exceeds' GPT-3-175B on 15/20 tasks are based on point estimate comparisons without statistical tests. Some margins are very small (e.g., StoryCloze 87.8 vs 87.7) and could be within noise."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "Language models are few-shot learners",
    346       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    347       "year": 2020,
    348       "relevance": "Foundational GPT-3 paper establishing few-shot prompting as a paradigm; provides the primary baselines for AMA comparison."
    349     },
    350     {
    351       "title": "Calibrate before use: Improving few-shot performance of language models",
    352       "authors": ["Tony Z. Zhao", "Eric Wallace", "Shi Feng", "Dan Klein", "Sameer Singh"],
    353       "year": 2021,
    354       "arxiv_id": "2102.09690",
    355       "relevance": "Addresses prompt sensitivity and calibration methods for LLMs; complementary approach to managing prompt brittleness."
    356     },
    357     {
    358       "title": "Self-consistency improves chain of thought reasoning in language models",
    359       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    360       "year": 2022,
    361       "relevance": "Prompt aggregation method via sampling diverse reasoning paths; directly compared against AMA in Table 7."
    362     },
    363     {
    364       "title": "Chain of thought prompting elicits reasoning in large language models",
    365       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    366       "year": 2022,
    367       "arxiv_id": "2201.11903",
    368       "relevance": "Foundational prompting strategy for LLM reasoning; AMA offers an alternative approach to improving LLM performance."
    369     },
    370     {
    371       "title": "Multitask prompted training enables zero-shot task generalization",
    372       "authors": ["Victor Sanh", "Albert Webson", "Colin Raffel"],
    373       "year": 2022,
    374       "relevance": "Introduces T0 model and PromptSource; used as a baseline model family and prompt source in AMA evaluation."
    375     },
    376     {
    377       "title": "Training language models to follow instructions with human feedback",
    378       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    379       "year": 2022,
    380       "arxiv_id": "2203.02155",
    381       "relevance": "InstructGPT/RLHF approach to improving LLM instruction following; complementary training-based strategy vs. AMA's inference-time approach."
    382     },
    383     {
    384       "title": "Snorkel: Rapid training data creation with weak supervision",
    385       "authors": ["Alexander Ratner", "Stephen H. Bach", "Henry Ehrenberg"],
    386       "year": 2017,
    387       "doi": "10.14778/3157794.3157797",
    388       "relevance": "Core weak supervision framework that AMA adapts for prompt aggregation; foundational to AMA's aggregation strategy."
    389     },
    390     {
    391       "title": "Emergent abilities of large language models",
    392       "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
    393       "year": 2022,
    394       "arxiv_id": "2206.07682",
    395       "relevance": "Studies scaling properties of LLMs; AMA demonstrates prompt aggregation as an alternative axis for scaling performance."
    396     },
    397     {
    398       "title": "OPT: Open Pre-trained Transformer Language Models",
    399       "authors": ["Susan Zhang", "Stephen Roller", "Naman Goyal"],
    400       "year": 2022,
    401       "arxiv_id": "2205.01068",
    402       "relevance": "Open-source LLM family used in AMA evaluation; relevant to assessing prompting methods across model families."
    403     },
    404     {
    405       "title": "On the opportunities and risks of foundation models",
    406       "authors": ["Rishi Bommasani", "Drew A Hudson", "Ehsan Adeli"],
    407       "year": 2021,
    408       "arxiv_id": "2108.07258",
    409       "relevance": "Comprehensive survey of foundation model capabilities and risks; frames the broader context of LLM prompting research."
    410     },
    411     {
    412       "title": "Scaling laws for neural language models",
    413       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    414       "year": 2020,
    415       "arxiv_id": "2001.08361",
    416       "relevance": "Establishes scaling laws for LLMs; AMA suggests prompt aggregation as another axis beyond parameter scale."
    417     }
    418   ]
    419 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs