scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24442B)
      1 {
      2   "paper": {
      3     "title": "BAMBOO: A Comprehensive Benchmark for Evaluating Long Text Modeling Capacities of Large Language Models",
      4     "authors": [
      5       "Zican Dong",
      6       "Tianyi Tang",
      7       "Junyi Li",
      8       "Wayne Xin Zhao",
      9       "Ji-Rong Wen"
     10     ],
     11     "year": 2023,
     12     "venue": "arXiv",
     13     "arxiv_id": "2309.13345"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper explicitly states in the abstract: 'We release our data, prompts, and code at https://github.com/RUCAIBox/BAMBOO.' A concrete GitHub URL is provided."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The benchmark datasets are released via the same GitHub repository. The abstract states 'We release our data, prompts, and code.' Table 2 also lists the public source URLs for each dataset."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No mention of requirements.txt, Dockerfile, conda environment, or specific library versions in the paper. The paper does not describe the environment setup needed to reproduce the experiments."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions that users can 'utilize the BAMBOO benchmark with instructions provided in our project repository' (Section 3.5), but it defers to the repository: the paper itself contains no step-by-step reproduction instructions."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "All results in Tables 3-7 are reported as point estimates without confidence intervals or error bars."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper makes comparative claims (e.g., 'ChatGPT-16k consistently demonstrates optimal performance') but uses no statistical significance tests. Comparisons are based solely on raw numbers."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Results are reported as raw percentages without formal effect sizes (no Cohen's d, odds ratios, etc.). While absolute performance numbers are given, the magnitude of differences relative to baselines is not systematically reported."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The benchmark contains 100-200 examples per dataset (Table 2) but no justification is given for why these sizes are adequate for the claims being made. No power analysis is discussed."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Results appear to be from single runs with no standard deviation, variance, or spread measure reported across experimental runs."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper includes random baselines for each task (Section 4.1) and compares five long-context LLMs against each other. Table 3 shows random baseline performance."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The five models tested (gpt-3.5-turbo-16k, Claude2-100k, ChatGLM2-6b-32k, Vicuna-7b-v1.5-16k, Longchat-7b-v1.5-32k) were all contemporary long-context models at the time of the study (2023)."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper includes multiple analytical experiments that function as ablations: RQ1 (extension tax comparison), RQ2 (evidence vs. complete text), RQ3 (instruction positions), RQ4 (context compression methods), and RQ5 (evidence position effects). These systematically vary individual factors."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Multiple metrics are used across different tasks: accuracy, concordance index, pass@1, and precision/recall/F1 (Table 2, Section 3.4)."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The paper explicitly avoids human evaluation, relying entirely on automatic metrics. The paper argues for 'accurate automatic evaluation' as a design principle (Section 3.1). While human annotators helped construct the benchmark, no human evaluation of model outputs is performed."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "There is no mention of a dev/test split. The benchmark appears to be used as a single evaluation set without any distinction between tuning and evaluation data. The paper does not describe holding out any portion of the data."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Table 3 provides per-dataset breakdowns across all 10 datasets and both length levels (4k and 16k). Performance is shown individually for each task rather than as a single aggregate."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 4.4 'Discussions' identifies specific failure modes: instruction forgetting, format errors, poor reasoning beyond long texts, and poor performance on uncommon tasks. Qualitative error patterns are described."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Several negative results are reported: models performing below random baselines on some tasks (Table 3, e.g., Longchat on ShowsPred), the 'extension tax' hurting short-text performance (RQ1), and summarization/truncation methods often performing poorly (RQ4, Table 6)."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract's claims about comprehensive evaluation, data contamination avoidance, and key findings (extending context is a double-edged sword, evidence/compression helps, instruction position sensitivity) are supported by results in Tables 3-7 and the RQ analyses in Section 4.3."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes causal-style claims such as 'extending the context window...is beneficial for medium-length texts but harmful for short texts' and 'limited diversity of training data types plays a role in the subpar performance.' These are causal claims (X causes Y) but the study design (comparing different models, not controlled manipulation of training data) does not adequately support causal inference. Confounds between models are not addressed."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The title claims 'Comprehensive Benchmark for Evaluating Long Text Modeling Capacities of Large Language Models' broadly, but results are from only 5 models (2 closed-source, 3 open-source 6B-7B models). The paper does not bound its generalizations to these specific models. Claims like 'LLMs perform poorly on uncommon tasks' generalize beyond the tested set."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper presents interpretations for its findings (e.g., attributing poor performance to 'limited diversity of training data') without discussing alternative explanations. For example, the 'extension tax' could be due to model capacity, not just position interpolation, but alternatives are not considered."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper uses 'gpt-3.5-turbo-16k' and 'Claude2-100K' without specifying exact API snapshots or version dates. For open models, 'Vicuna-7b-v1.5-16k', 'Longchat-7b-v1.5-32k', and 'ChatGLM2-6b-32k' are more specific but no checkpoint hashes or exact dates are provided. The closed-source models lack snapshot identifiers."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The abstract states 'We release our data, prompts, and code.' The prompts are therefore part of the released repository, and the paper describes the task formats in detail (Section 3.3)."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No temperature, top-p, max tokens, or other generation hyperparameters are reported in the paper. For API-based models (GPT-3.5-turbo, Claude2), no sampling settings are specified."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No agentic scaffolding is used. The benchmark evaluates LLMs directly on tasks via single-turn prompting."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 3.2 describes data collection: sources are specified, text extraction excludes images/tables/footnotes, texts under 1000 tokens are excluded, long texts are truncated. Section 3.3 details how each dataset was constructed. The process from raw sources to final datasets is documented."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "There is no dedicated Limitations or Threats to Validity section. Section 4.4 'Discussions' discusses problems of LLMs but not limitations of the benchmark or study itself."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "Apart from acknowledging that 'we cannot guarantee the absence of any pre-training data used in our benchmark' (Section 3.1), the paper discusses no specific threats to the validity of the benchmark or evaluation methodology, such as small sample sizes per task or limited model diversity."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what the results do NOT show. There are no explicit boundary statements about the generalizability of findings to other models, languages, or task types beyond those tested."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The data is released via GitHub (https://github.com/RUCAIBox/BAMBOO) as stated in the abstract. Table 2 also lists original source URLs for each dataset, enabling independent verification."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 3.2 describes data collection: four source types (NLP papers, government reports, TV show transcripts, committee meeting transcripts), all from 2023, with specific URLs listed in Table 2. Section 3.3 details construction of each of the 10 datasets."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper mentions 'With the assistance of human labelers, we manually construct two multi-choice question answering datasets' (Section 3.3) but does not describe who these annotators were, how they were recruited, or what instructions they received beyond rephrasing expressions."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The pipeline from raw sources to final datasets is documented: collection from specific websites (Section 3.2), parsing and text extraction, filtering by length (>1000 tokens), truncation of long texts, and division into BAMBOO-4k and BAMBOO-16k subsets. Table 2 provides final counts."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Section 6 'Acknowledgments' discloses funding: 'National Natural Science Foundation of China under Grant No. 62222215, Beijing Natural Science Foundation under Grant No. 4222027 and L233008.'"
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are listed on the first page: Renmin University of China (Gaoling School of AI, School of Information) and Université de Montréal (DIRO). The authors are academic researchers not affiliated with the companies whose models they evaluate."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The funders (National Natural Science Foundation of China, Beijing Natural Science Foundation) are government research agencies with no financial stake in the outcome of the benchmark evaluation."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests or financial interests statement is included in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Section 3.1 states: 'training data cutoff for gpt-3.5-turbo and Claude2 are up to September 2021 and the early 2023, respectively.' Training cutoffs are explicitly discussed as part of the contamination avoidance design."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Data contamination is a core design principle. Section 3.1 extensively discusses the issue: data sources are retained only from 2023, answer modifications prevent memorization, and the paper acknowledges 'we cannot guarantee the absence of any pre-training data used in our benchmark.'"
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "The entire benchmark is designed to address contamination: using 2023 data sources, modifying answers in AltQA to prevent memorization, and constructing novel datasets (SenHallu, AbsHallu, PaperQA, MeetingQA). This is discussed in Sections 3.1 and 3.2."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants in the study. Human annotators helped construct the benchmark but were not study subjects."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in the study. This is a benchmark evaluation of LLMs."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in the study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in the study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No inference costs, API costs, tokens consumed, or wall-clock time are reported for running the benchmark evaluations across the five models."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No total computational budget, GPU hours, API spend, or hardware details are reported for the experiments."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "ChatGPT-16k consistently demonstrates optimal performance across most datasets, surpassing other LLMs in nearly all tasks.",
    292       "evidence": "Table 3 shows ChatGPT-16k achieving the highest or near-highest scores on 8 out of 10 datasets across both length levels, with only a minor gap in hallucination detection where Claude2 slightly outperforms it on F1.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Extending the context window of LLMs is a double-edged sword: beneficial for medium-length texts but harmful for short texts.",
    297       "evidence": "Figure 1 (Section 4.3, RQ1) compares short and long-context variants of ChatGPT and Vicuna. Vicuna-16k shows improved BAMBOO-4k performance but decreased MMLU performance compared to Vicuna.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "Using evidence and properly compressing long texts can generally enhance LLM performance.",
    302       "evidence": "Table 4 (RQ2) shows evidence-only input generally improves performance. Table 6 (RQ4) shows retrieval-augmented methods achieve comparable or superior performance to long-context LLMs on some tasks.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "LLMs' performances on different datasets are sensitive to positions of instructions, with evidence at both ends being beneficial due to larger attention scores.",
    307       "evidence": "Table 5 (RQ3) shows varying performance across Pre-Ins, Post-Ins, and Both-Ins configurations. Figure 2 and Table 7 (RQ5) demonstrate U-shaped attention patterns and improved performance with evidence at text endpoints.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "BAMBOO avoids data contamination by using data sources released in 2023.",
    312       "evidence": "Section 3.1 describes the contamination avoidance strategy and Section 3.2 confirms data sources are from 2023. The paper acknowledges this is a mitigation, not a guarantee.",
    313       "supported": "moderate"
    314     }
    315   ],
    316   "methodology_tags": [
    317     "benchmark-eval"
    318   ],
    319   "key_findings": "BAMBOO evaluates five long-context LLMs on 10 datasets across 5 task types at two length levels (4k and 16k tokens), with data contamination avoidance as a key design principle. ChatGPT-16k consistently outperforms other models, while open-source models often fail to surpass random baselines on uncommon tasks like text sorting and code completion. The paper identifies that extending context windows can hurt short-text performance ('extension tax'), that context compression via retrieval can match long-context models, that instruction position affects performance, and that evidence placed at either end of the input helps because of U-shaped attention patterns.",
    320   "red_flags": [
    321     {
    322       "flag": "No statistical rigor",
    323       "detail": "All results are point estimates from apparently single runs with no confidence intervals, error bars, significance tests, or variance reporting. Comparative claims like 'consistently demonstrates optimal performance' are based solely on raw number comparisons."
    324     },
    325     {
    326       "flag": "Small sample sizes per dataset",
    327       "detail": "Individual datasets contain only 100-200 examples (Table 2), which combined with single-run evaluation and no uncertainty quantification makes the reliability of numerical differences unclear."
    328     },
    329     {
    330       "flag": "No limitations section",
    331       "detail": "The paper lacks any dedicated limitations or threats-to-validity section. The discussion section only addresses problems of LLMs, not limitations of the benchmark itself (e.g., limited model diversity, potential remaining contamination, small dataset sizes)."
    332     },
    333     {
    334       "flag": "Overgeneralized claims from limited model set",
    335       "detail": "Claims about 'LLMs' in general are based on only 5 models: 2 closed-source and 3 open-source models with 6B-7B parameters. The open-source models are all relatively small, making claims about LLM capabilities overly broad."
    336     },
    337     {
    338       "flag": "Missing hyperparameters",
    339       "detail": "No generation hyperparameters (temperature, top-p, max tokens) are reported for any model, making results impossible to reproduce exactly and potentially sensitive to undisclosed settings."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Evaluating large language models trained on code",
    345       "authors": ["Mark Chen", "Jerry Tworek"],
    346       "year": 2021,
    347       "arxiv_id": "2107.03374",
    348       "relevance": "Introduces HumanEval and Codex, foundational work on LLM code generation evaluation."
    349     },
    350     {
    351       "title": "Lost in the middle: How language models use long contexts",
    352       "authors": ["Nelson F. Liu", "Kevin Lin", "John Hewitt"],
    353       "year": 2023,
    354       "arxiv_id": "2307.03172",
    355       "relevance": "Key study on positional biases in LLM long-context processing, directly validated by this paper's RQ5."
    356     },
    357     {
    358       "title": "L-eval: Instituting standardized evaluation for long context language models",
    359       "authors": ["Chenxin An", "Shansan Gong", "Ming Zhong"],
    360       "year": 2023,
    361       "arxiv_id": "2307.11088",
    362       "relevance": "Competing long-context benchmark that BAMBOO compares against in Table 1."
    363     },
    364     {
    365       "title": "Longbench: A bilingual, multitask benchmark for long context understanding",
    366       "authors": ["Yushi Bai", "Xin Lv", "Jiajie Zhang"],
    367       "year": 2023,
    368       "arxiv_id": "2308.14508",
    369       "relevance": "Competing long-context benchmark discussed as lacking contamination avoidance."
    370     },
    371     {
    372       "title": "Zeroscrolls: A zero-shot benchmark for long text understanding",
    373       "authors": ["Uri Shaham", "Maor Ivgi", "Avia Efrat"],
    374       "year": 2023,
    375       "arxiv_id": "2305.14196",
    376       "relevance": "Competing long-context benchmark compared in Table 1, criticized for contamination risk."
    377     },
    378     {
    379       "title": "Time travel in llms: Tracing data contamination in large language models",
    380       "authors": ["Shahriar Golchin", "Mihai Surdeanu"],
    381       "year": 2023,
    382       "arxiv_id": "2308.08493",
    383       "relevance": "Foundational work on data contamination detection that motivates BAMBOO's contamination avoidance design."
    384     },
    385     {
    386       "title": "A survey of large language models",
    387       "authors": ["Wayne Xin Zhao", "Kun Zhou", "Junyi Li"],
    388       "year": 2023,
    389       "arxiv_id": "2303.18223",
    390       "relevance": "Comprehensive LLM survey that inspired BAMBOO's capacity evaluation framework."
    391     },
    392     {
    393       "title": "HaluEval: A large-scale hallucination evaluation benchmark for large language models",
    394       "authors": ["Junyi Li", "Xiaoxue Cheng", "Wayne Xin Zhao"],
    395       "year": 2023,
    396       "arxiv_id": "2305.11747",
    397       "relevance": "Hallucination evaluation benchmark used as basis for BAMBOO's hallucination detection task construction."
    398     },
    399     {
    400       "title": "Retrieval meets long context large language models",
    401       "authors": ["Peng Xu", "Wei Ping", "Xianchao Wu"],
    402       "year": 2023,
    403       "arxiv_id": "2310.03025",
    404       "relevance": "Study on retrieval augmentation vs. long context, directly related to BAMBOO's RQ4 findings."
    405     },
    406     {
    407       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    408       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    409       "year": 2023,
    410       "arxiv_id": "2306.05685",
    411       "relevance": "LLM evaluation methodology paper; Vicuna model used as a baseline in BAMBOO."
    412     },
    413     {
    414       "title": "Instruction position matters in sequence generation with large language models",
    415       "authors": ["Yijin Liu", "Xianfeng Zeng", "Fandong Meng"],
    416       "year": 2023,
    417       "arxiv_id": "2308.12097",
    418       "relevance": "Study on instruction positioning that directly motivates BAMBOO's RQ3 experiments on instruction position effects."
    419     }
    420   ]
    421 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs