scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24674B)
      1 {
      2   "paper": {
      3     "title": "Sequential Enumeration in Large Language Models",
      4     "authors": ["Kuinan Hou", "Marco Zorzi", "Alberto Testolin"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2512.04727",
      8     "doi": "10.48550/arXiv.2512.04727"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "LLMs can deploy counting procedures when explicitly prompted but none spontaneously engage in counting when simply asked to enumerate items. Proprietary models (GPT-5, Gemini 2.5) achieve near-ceiling accuracy with explicit counting on production tasks but still struggle with naming tasks. PCA analysis of Llama-70B hidden states reveals structured internal counter dynamics in mental counting conditions but not in explicit counting, suggesting fundamentally different computational strategies. Counting ability scales gradually with model size rather than emerging sharply.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper mentions models' responses are in pickle files and scripts in an 'anonymized drive' but no working URL or repository link is provided in the paper text."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Data is described as stored in pickle files on an anonymized drive, but no permanent public URL or repository link is provided in the paper."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Hardware is mentioned (NVIDIA L40s, A100 GPUs) in Supplementary Section B, but no requirements.txt, Dockerfile, or library version details are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions or README-style commands are provided. The paper describes methodology but not how to replicate it."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Error bars are shown in Figure 1 (binomial standard errors for accuracy, standard errors of the mean for MAE). Standard errors are also reported in neuronal correlation analysis."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Friedman non-parametric tests with Bonferroni-corrected pairwise comparisons are used (Section 4.1, Supplementary G.3). Mann-Whitney U tests are used for post-hoc comparisons."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Kendall's W effect sizes are reported for the Friedman test and rank-biserial correlation for pairwise comparisons (Supplementary G.3). E.g., GPT-5 W=0.02 (small), Llama70B W=0.37 (large)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for why 10 trials per numerosity per condition was chosen. No power analysis is discussed."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "NAE summary table (Table 1) reports mean ± standard deviation. Error bars in figures show standard errors. Spearman correlations include p-values."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Multiple models are compared against each other (GPT-5, GPT-4.1, Gemini 2.5, Llama variants, Qwen). Different prompting conditions serve as within-model baselines."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Models tested include GPT-5 (2025), Gemini 2.5 Pro, Llama 3.3 70B, and QwQ-32B — all recent as of the paper's writing."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The four prompting strategies (explicit, spontaneous, mental, forbid) serve as a systematic ablation of the counting instruction variable. Model size analysis (3B/8B/70B) ablates capacity."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Accuracy and MAE are both reported (Section 3.4). NAE (Normalized Absolute Error) is also used for scale-invariant comparison (Supplementary G.3)."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant to evaluating LLM counting accuracy — the ground truth is a deterministic count."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is not a learning/training study. Test sequences are generated fresh for evaluation. No train/test split concept applies."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by model, prompting condition, task type (naming/production), and stimulus type (letter/word) across Figures 1, 2, and 8."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 4.2 analyzes enumeration errors in detail. Supplementary G.1 provides specific examples of counting failures, including wrong repeating, wrong counting, and grouping errors."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The main finding is largely negative: LLMs cannot spontaneously count, naming task performance is poor even with explicit instructions, and no model achieves reliable enumeration across all conditions."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about LLMs deploying counting when explicitly prompted but not spontaneously, and about the gap between neural and symbolic approaches, are supported by the results in Section 4."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims are limited and adequately supported. The prompting strategy manipulation is a controlled single-variable manipulation, and claims like 'explicit counting improves performance' are justified by the experimental design."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper specifies which models were tested (Section 3.1), limits claims to sequential enumeration of letters and words, and notes that counting is not strictly emergent but gradually improves (Section 4.1)."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 5 discusses alternative explanations including tokenization effects, surface-level token prediction strategies vs. internal counting, and the relationship to the theoretical framework of Yehudai et al. [53]."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper directly measures counting accuracy and MAE — the measurements match the claims at the same granularity. No proxy gap exists between what is measured and what is claimed."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Exact model versions are provided in Section 3.1: gpt-5-2025-08-07, gpt-4.1-2025-04-14, gemini-2.5-pro-preview-03-25, Llama-3.2-3B-Instruct, Llama-3.1-8B-Instruct, Llama-3.3-70B-Instruct, QwQ-32B."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full system prompt components are provided in Section 3.3 with exact instructional cues for each condition. User message examples are given. Response formatting details in Supplementary D."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Token limits are specified (Supplementary E.1), but temperature, top-p, and other sampling parameters are not reported for any model."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. Models are prompted directly with system/user messages."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Sequence creation is documented in Supplementary C (word selection, homogeneous/heterogeneous stimuli). Response parsing pipeline is detailed in Supplementary F, including retry logic and Levenshtein distance checks."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. The Discussion (Section 5) addresses some limitations inline but does not have a separate limitations subsection."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The discussion section does not address threats like invalid trial rates, token limit effects on results, or whether the prompting design may bias outcomes."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper is explicit about testing only sequential enumeration of letters and words (not visual counting), and notes that results are for specific models tested. Section 5 acknowledges the limitation to the tested computational strategies."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Raw data (pickle files, embeddings) is mentioned as being on an anonymized drive, but no permanent public URL is provided in the paper."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Detailed description of how test sequences are generated (Supplementary C), how responses are collected via APIs and local inference (Supplementary B), and token limits (Supplementary E)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The study evaluates LLMs on generated test sequences."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from sequence generation to response collection to parsing to metric computation is documented across Section 3 and Supplementary Sections C, E, and F, including invalid trial handling and retry logic."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgements section lists Italian Ministry of Education and Research PRIN Grant 2022EBC78W, European Union NextGenerationEU PNRR project GROUNDEEP, and China Scholarship Council support."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are from the University of Padova. They evaluate third-party models (GPT, Gemini, Llama, Qwen) — no conflict with their own products."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funders are government/academic agencies (Italian MoE, EU, China Scholarship Council) with no financial interest in LLM counting performance outcomes."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the models tested. This is relevant because models could have seen counting-related evaluation patterns in training."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether the test sequences or counting patterns could have appeared in training data."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The test sequences are freshly generated (reducing contamination risk), but this advantage is not explicitly discussed or framed as a contamination mitigation strategy."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No API costs, token consumption totals, or inference time are reported despite using both commercial APIs and GPU clusters."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Hardware is listed (L40s, A100 GPUs) but total GPU hours, API spend, or overall compute budget are not stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Random seeds for sequence generation and model sampling are not discussed. No seed sensitivity analysis is reported."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "10 trials per numerosity level are used (Supplementary D.1). Target numerosities range from 10 to 100 in increments of 10."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Response formatting was optimized for smaller Llama models (Supplementary D.1) but the search budget for this optimization is not reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "For response formatting optimization, the selection criterion is clearly stated: 'The response formatting instruction that yielded the highest proportion of valid outputs was selected' (Supplementary D.1)."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Bonferroni correction is applied to pairwise comparisons following the Friedman test (Supplementary G.3, α = 0.0083)."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "The authors are not evaluating their own system — they are testing third-party LLMs on a counting task."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "Compute differences are negligible across conditions — the variable is prompting strategy, not compute."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper carefully grounds the counting tasks in cognitive science literature (naming vs. production, how-many vs. give-N) and discusses what counting ability actually means (Section 1, Section 5), including one-to-one correspondence, stable order, and cardinality principle."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. Models are prompted directly."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether models may have seen counting-related evaluations or similar tasks in training data."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the prompting setup leaks information about expected answers."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Independence between test sequences is not explicitly discussed, although the sequences are randomly generated, which mitigates this concern."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is used or discussed."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Some LLMs can deploy counting procedures when explicitly prompted but none spontaneously engage in counting when asked to enumerate items.",
    365       "evidence": "Figure 1 shows high accuracy for proprietary models in explicit counting (production task) but dramatic drops in spontaneous counting. Supplementary G.1 confirms that, across all trials tested, only Gemini 2.5 Pro spontaneously counted, and only in a single trial.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Naming tasks (counting given items) are more challenging than production tasks (generating N items) even with explicit counting instructions.",
    370       "evidence": "Figure 1 shows consistently lower accuracy for naming vs. production across all models and conditions. Section 4.1 discusses this asymmetry.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Counting ability scales gradually with model size rather than emerging sharply.",
    375       "evidence": "Llama 3B→8B→70B shows accuracy increasing from ~0.10 to ~0.24 and MAE decreasing from >200 to ~70 (Supplementary G.2, Figure 7).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Mental counting reveals structured internal counter dynamics (horseshoe pattern in PCA) while explicit counting uses fundamentally different token-based strategies.",
    380       "evidence": "PCA analysis (Section 4.3, Figures 3-4) shows Spearman correlations >0.83 for PC1 with step number in mental counting, while explicit counting shows periodic patterns at decade multiples. Neuronal tuning analysis (Figure 5) confirms specialized populations in mental but not explicit counting.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Words are easier to count than letters due to tokenization effects.",
    385       "evidence": "Section 4.1 reports higher accuracy for word conditions across models, attributed to one-to-one token-word correspondence vs. variable letter tokenization. Supported by [54].",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Missing hyperparameters",
    392       "detail": "Temperature, top-p, and other sampling parameters are not reported for any model. These can significantly affect counting behavior, especially for production tasks where sampling determines output."
    393     },
    394     {
    395       "flag": "Anonymized data availability",
    396       "detail": "Data and code are described as being on an 'anonymized drive' but no working URL is provided in the paper. This makes verification impossible."
    397     },
    398     {
    399       "flag": "No contamination discussion despite freshly generated tests",
    400       "detail": "The paper misses an opportunity to discuss that its randomly generated test sequences mitigate contamination concerns — and doesn't address whether models may have been trained on similar counting evaluations."
    401     },
    402     {
    403       "flag": "Small number of trials",
    404       "detail": "Only 10 trials per numerosity level without justification for this sample size. For some conditions, invalid trial rates further reduce effective sample sizes (Llama-3B: 75.8%, Qwen-32B: 87.5%)."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    410       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    411       "year": 2022,
    412       "arxiv_id": "2201.11903",
    413       "relevance": "Foundational work on CoT prompting, directly relevant to the paper's counting strategy manipulation."
    414     },
    415     {
    416       "title": "Emergent abilities of large language models",
    417       "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
    418       "year": 2022,
    419       "arxiv_id": "2206.07682",
    420       "relevance": "Defines emergent abilities framework that this paper tests for counting skills."
    421     },
    422     {
    423       "title": "Are emergent abilities of large language models a mirage?",
    424       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    425       "year": 2023,
    426       "relevance": "Challenges emergence concept — this paper's gradual scaling results align with the mirage thesis."
    427     },
    428     {
    429       "title": "When can transformers count to n?",
    430       "authors": ["Gilad Yehudai", "Haim Kaplan", "Asma Ghandeharioun", "Mor Geva", "Amir Globerson"],
    431       "year": 2024,
    432       "arxiv_id": "2407.15160",
    433       "relevance": "Theoretical analysis of transformer counting limitations that this paper extends empirically."
    434     },
    435     {
    436       "title": "Counting ability of large language models and impact of tokenization",
    437       "authors": ["Xiang Zhang", "Juntao Cao", "Chenyu You"],
    438       "year": 2024,
    439       "arxiv_id": "2410.19730",
    440       "relevance": "Studies tokenization impact on LLM counting — directly relevant finding confirmed by this paper."
    441     },
    442     {
    443       "title": "Why do large language models (LLMs) struggle to count letters?",
    444       "authors": ["Tianyu Fu", "Rodrigo Ferrando", "Javier Conde"],
    445       "year": 2024,
    446       "arxiv_id": "2412.18626",
    447       "relevance": "Directly studies LLM letter counting limitations, a core task in this paper."
    448     },
    449     {
    450       "title": "LLM the genius paradox: A linguistic and math expert's struggle with simple word-based counting problems",
    451       "authors": ["Nan Xu", "Xuemei Ma"],
    452       "year": 2024,
    453       "arxiv_id": "2410.14166",
    454       "relevance": "Studies LLM word counting failures, directly complementary to this paper's findings."
    455     },
    456     {
    457       "title": "Benchmarking GPT-4 on algorithmic problems: A systematic evaluation of prompting strategies",
    458       "authors": ["Francesco Petruzzellis", "Alberto Testolin", "Alessandro Sperduti"],
    459       "year": 2024,
    460       "arxiv_id": "2402.17396",
    461       "relevance": "Prior work by the same group on prompting strategies for algorithmic tasks in LLMs."
    462     },
    463     {
    464       "title": "Language models need inductive biases to count inductively",
    465       "authors": ["Yingshan Chang", "Yonatan Bisk"],
    466       "year": 2024,
    467       "arxiv_id": "2405.20131",
    468       "relevance": "Argues non-recurrent transformers lack needed inductive biases for counting."
    469     },
    470     {
    471       "title": "Embers of autoregression: Understanding large language models through the problem they are trained to solve",
    472       "authors": ["R. Thomas McCoy", "Shunyu Yao", "Dan Friedman"],
    473       "year": 2023,
    474       "arxiv_id": "2309.13638",
    475       "relevance": "Shows LLM counting is harder when correct output is low-probability text, relevant to understanding counting failures."
    476     }
    477   ]
    478 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs