scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18797B)
      1 {
      2   "paper": {
      3     "title": "Attention Is All You Need",
      4     "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N. Gomez", "Łukasz Kaiser", "Illia Polosukhin"],
      5     "year": 2017,
      6     "venue": "NeurIPS 2017 (NIPS 2017)",
      7     "arxiv_id": "1706.03762"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper provides a GitHub link: https://github.com/tensorflow/tensor2tensor (Section 7, Conclusion)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses standard public benchmarks: WMT 2014 English-German, WMT 2014 English-French, and Penn Treebank WSJ, all publicly available."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions 8 NVIDIA P100 GPUs (Section 5.2) but provides no dependency specifications, library versions, or environment setup instructions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The code link is given but no README-level guidance for replicating specific experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results in Tables 2, 3, and 4 are reported as point estimates without confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims state-of-the-art results by comparing BLEU scores directly without any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports improvements with baseline context, e.g., 'improving over the existing best results, including ensembles, by over 2 BLEU' (abstract), and Table 2 provides full baseline comparisons enabling effect size assessment."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for dataset sizes or discussion of whether the training data is sufficient. Standard benchmarks are used but no power or sample size analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or multi-run results are reported. All results appear to be single-run numbers."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 2 compares against ByteNet, GNMT+RL, ConvS2S, MoE, and their ensemble variants. Table 4 compares against multiple parsing baselines."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include ConvS2S (2017), MoE (2017), GNMT+RL (2016) — all contemporary or near-contemporary at time of publication."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 3 provides extensive ablations varying number of attention heads (A), key dimension (B), model size (C), dropout and label smoothing (D), and positional encoding (E)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports both BLEU and perplexity (PPL) for translation experiments (Table 3), and F1 for constituency parsing (Table 4)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of translation quality is included. All evaluation is automated (BLEU, PPL, F1). For a machine translation paper claiming state-of-the-art quality, human evaluation is relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Main results (Table 2) are on newstest2014 test set. Ablations (Table 3) are explicitly on newstest2013 development set. Parsing results on Section 23 of WSJ (standard test split)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by task (EN-DE, EN-FR translation, constituency parsing) and by model variant (Table 3 ablations)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No failure cases or error analysis is presented. The paper does not discuss where the Transformer fails or produces poor translations."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 3 shows configurations that hurt performance: single attention head loses 0.9 BLEU (row A), reducing dk hurts quality (row B), removing dropout (row D) hurts. Label smoothing 'hurts perplexity' (Section 5.4)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of 28.4 BLEU on EN-DE and 41.8 on EN-FR are supported by Table 2. Training time of 3.5 days on 8 GPUs is stated in Section 5.2. Constituency parsing claim is supported by Table 4."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about component contributions are supported by controlled ablation studies in Table 3 (single-variable manipulations of heads, dimensions, dropout, etc.)."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title 'Attention Is All You Need' and the abstract's claim that the Transformer 'generalizes well to other tasks' overstate the evidence: the generalization claim rests on only one additional task (English constituency parsing). In total, the paper tests on two translation language pairs and one parsing task."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for the performance gains. Could the improvement be due to increased parallelism enabling better hyperparameter tuning rather than architectural superiority? No confounds are discussed."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "This paper proposes its own architecture; it does not evaluate pre-existing versioned models/APIs."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used. This is a neural architecture paper with standard supervised training."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Comprehensive hyperparameters: N=6 layers, dmodel=512, dff=2048, h=8 heads, dk=dv=64, Pdrop=0.1, εls=0.1, Adam with β1=0.9, β2=0.98, warmup=4000, beam size=4, length penalty α=0.6 (Sections 3, 5, 6)."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a neural network architecture paper."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5.1 describes byte-pair encoding with ~37K shared vocabulary for EN-DE, 32K word-piece vocabulary for EN-FR, batching by approximate sequence length with ~25K source and target tokens per batch."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section exists in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed anywhere in the paper."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit scope boundaries are stated. The conclusion mentions plans to extend to other modalities but does not explicitly bound what the current results do NOT show."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "WMT 2014 and Penn Treebank are standard public benchmarks with publicly available data."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 5.1 describes the datasets: WMT 2014 EN-DE (~4.5M sentence pairs), WMT 2014 EN-FR (36M sentences), Penn Treebank WSJ (~40K training sentences)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data is from standard public benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5.1 documents the pipeline: raw sentences → byte-pair/word-piece encoding → batching by approximate sequence length. Checkpoint averaging and beam search parameters are documented in Section 6.1."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source is disclosed. The paper has an Acknowledgements section thanking individuals but no funding statement. All authors are affiliated with Google Brain/Research or University of Toronto."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Google Brain, Google Research, University of Toronto. Footnotes note work performed while at Google."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. Google employees proposing a Google architecture have an inherent interest in its success."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper trains its own models from scratch on specified datasets; it does not evaluate a pre-trained model's knowledge on a benchmark."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable — the model is trained from scratch on specified training data with standard train/dev/test splits."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable — no pre-trained model is evaluated on benchmarks that could have been in its training data."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, latency, or tokens-per-second metrics are reported."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Training compute is well documented: base model trained for 12 hours on 8 P100 GPUs (100K steps), big model for 3.5 days (300K steps). Table 2 provides estimated FLOPs for all models."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The Transformer (big) achieves 28.4 BLEU on WMT 2014 English-to-German, improving over existing best results including ensembles by over 2 BLEU.",
    286       "evidence": "Table 2 shows Transformer (big) at 28.4 BLEU vs. previous best ensemble (GNMT+RL Ensemble) at 26.30 BLEU.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "The Transformer (big) achieves a new single-model state-of-the-art BLEU score of 41.8 on WMT 2014 English-to-French.",
    291       "evidence": "Table 2 shows 41.8 BLEU. However, Section 6.1 reports 41.0 BLEU for this task, creating an inconsistency with the abstract's 41.8 claim.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "The Transformer requires significantly less training cost than competitive models.",
    296       "evidence": "Table 2 shows Transformer (big) EN-DE at 2.3×10^19 FLOPs vs. GNMT+RL Ensemble at 1.8×10^20 FLOPs — roughly 8x less. Base model at 3.3×10^18 is even cheaper.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "The Transformer generalizes well to English constituency parsing.",
    301       "evidence": "Table 4 shows 91.3 F1 (WSJ only) and 92.7 F1 (semi-supervised), competitive with or exceeding most baselines except RNNG generative (93.3).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Multi-head attention is beneficial; single-head attention is 0.9 BLEU worse.",
    306       "evidence": "Table 3 row (A) shows single-head at 24.9 BLEU vs. base (8 heads) at 25.8 BLEU on the dev set.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "The paper introduces the Transformer architecture, which relies entirely on self-attention mechanisms and dispenses with recurrence and convolution. On WMT 2014 machine translation, it achieves state-of-the-art BLEU scores (28.4 EN-DE, 41.8 EN-FR) while requiring a fraction of the training compute of competitive models. Ablation studies demonstrate the importance of multi-head attention, model size, and regularization. The architecture also transfers to English constituency parsing with competitive results.",
    312   "red_flags": [
    313     {
    314       "flag": "No variance or multi-run results",
    315       "detail": "All results appear to be single runs with no error bars, standard deviations, or confidence intervals. For a paper making state-of-the-art claims, this makes it impossible to assess whether observed differences are within noise."
    316     },
    317     {
    318       "flag": "Abstract/body BLEU inconsistency on EN-FR",
    319       "detail": "The abstract claims 41.8 BLEU on EN-FR, but Section 6.1 states 'our big model achieves a BLEU score of 41.0' for the same task. This discrepancy is unexplained."
    320     },
    321     {
    322       "flag": "Overly broad title and generalization claims",
    323       "detail": "The title 'Attention Is All You Need' and claims of generalization are based on only two translation language pairs and one parsing task. The scope of evidence is narrower than the scope of claims."
    324     },
    325     {
    326       "flag": "No limitations section",
    327       "detail": "The paper contains no discussion of limitations, threats to validity, or scope boundaries despite making broad architectural claims."
    328     },
    329     {
    330       "flag": "Conflict of interest: Google evaluating Google architecture",
    331       "detail": "All authors are Google employees (with one University of Toronto affiliate working at Google). The paper proposes and evaluates a Google-developed architecture with no independent evaluation or conflict disclosure."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "Neural machine translation by jointly learning to align and translate",
    337       "authors": ["Dzmitry Bahdanau", "Kyunghyun Cho", "Yoshua Bengio"],
    338       "year": 2014,
    339       "arxiv_id": "1409.0473",
    340       "relevance": "Foundational attention mechanism paper for sequence-to-sequence models, directly motivating the Transformer's attention-only approach."
    341     },
    342     {
    343       "title": "Convolutional sequence to sequence learning",
    344       "authors": ["Jonas Gehring", "Michael Auli", "David Grangier", "Denis Yarats", "Yann N. Dauphin"],
    345       "year": 2017,
    346       "arxiv_id": "1705.03122",
    347       "relevance": "Key baseline: convolutional approach to sequence transduction that the Transformer outperforms."
    348     },
    349     {
    350       "title": "Google's neural machine translation system: Bridging the gap between human and machine translation",
    351       "authors": ["Yonghui Wu", "Mike Schuster", "Zhifeng Chen"],
    352       "year": 2016,
    353       "arxiv_id": "1609.08144",
    354       "relevance": "GNMT is a primary baseline; represents the recurrent approach the Transformer replaces."
    355     },
    356     {
    357       "title": "Outrageously large neural networks: The sparsely-gated mixture-of-experts layer",
    358       "authors": ["Noam Shazeer", "Azalia Mirhoseini", "Krzysztof Maziarz"],
    359       "year": 2017,
    360       "arxiv_id": "1701.06538",
    361       "relevance": "Mixture-of-experts baseline in Table 2; relevant to scaling and efficiency of large models."
    362     },
    363     {
    364       "title": "Sequence to sequence learning with neural networks",
    365       "authors": ["Ilya Sutskever", "Oriol Vinyals", "Quoc V. Le"],
    366       "year": 2014,
    367       "relevance": "Foundational encoder-decoder architecture that the Transformer builds upon and replaces recurrence in."
    368     },
    369     {
    370       "title": "Deep residual learning for image recognition",
    371       "authors": ["Kaiming He", "Xiangyu Zhang", "Shaoqing Ren", "Jian Sun"],
    372       "year": 2016,
    373       "relevance": "Residual connections used in the Transformer architecture are adopted from this work."
    374     }
    375   ]
    376 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs