scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30941B)
      1 {
      2   "paper": {
      3     "title": "Rethinking Benchmark and Contamination for Language Models with Rephrased Samples",
      4     "authors": [
      5       "Shuo Yang",
      6       "Wei-Lin Chiang",
      7       "Lianmin Zheng",
      8       "Joseph E. Gonzalez",
      9       "Ion Stoica"
     10     ],
     11     "year": 2023,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2311.04850",
     14     "doi": "10.48550/arXiv.2311.04850"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "Simple rephrasing or translation of benchmark test cases bypasses standard decontamination methods (n-gram overlap, embedding similarity) while enabling a 13B model to match GPT-4 performance on MMLU, HumanEval, and GSM-8k. The proposed LLM-based decontaminator, which combines embedding similarity search with GPT-4 judgment, achieves substantially higher detection F1 scores than these baselines. Applied to real-world pre-training and fine-tuning datasets (RedPajama, StarCoder-Data, CodeAlpaca, MATH, FLAN), the method reveals 0.5-18.9% previously unknown benchmark overlap, including unintentional contamination in GPT-generated synthetic datasets.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper provides a public GitHub repository: https://github.com/lm-sys/llm-decontaminator, stated in the abstract and conclusion."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "All benchmarks used (MMLU, HumanEval, GSM-8k) are publicly available, and the training datasets analyzed (RedPajama, CodeAlpaca, StarCoder-Data, The Stack, FLAN, etc.) are all public. The decontamination tool is also released."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed dependency listing is provided in the paper. Only model names are mentioned without environment setup details."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided in the paper. The algorithms are described (Algorithms 1-2) but there are no commands or scripts to replicate the experiments."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results in Tables 2-7 are point estimates with no confidence intervals, error bars, or uncertainty measures."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper compares detection F1 scores and benchmark accuracies across methods without any statistical significance tests. Claims like 'the LLM decontaminator works significantly better' are based solely on comparing numbers."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Baseline and post-contamination scores are reported together (e.g., Llama-2-13B from 54.8 to 89.9 on MMLU, CodeLlama-13B from 36.0 to 81.1 on HumanEval), providing sufficient context to assess effect magnitude."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification is given for the detection evaluation benchmark size (200 prompt pairs: 100 random + 100 rephrased). The choice of 3 MMLU subjects out of 57 for detection evaluation is not justified."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No variance, standard deviation, or spread measures are reported for any experiment. All results appear to be from single runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Detection methods are compared: n-gram overlap, embedding similarity search (with multiple embedding models), and the proposed LLM decontaminator, with random detection as baseline (Tables 5-6). For contamination effects, original model scores serve as baselines."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include contemporary models (Llama-2, CodeLlama, 2023) and GPT-4 as a reference point. Detection baselines use current embedding models (sentence-BERT variants). All baselines are from 2019-2023."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Multiple rephrasing variants are tested: English vs. Chinese translation (Table 2), question-only vs. full prompt (Table 2), single-language vs. multi-language code translation (Table 3). These systematically vary components of the rephrasing approach."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple metrics are used: accuracy for MMLU and GSM-8k, Pass@1 for HumanEval, and F1 score for detection method evaluation."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation is included. All detection accuracy and benchmark scoring is automated. Human judgment could have validated whether the LLM decontaminator's decisions align with human assessment of contamination."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Standard benchmark test sets (MMLU, HumanEval, GSM-8k) are used for evaluation. The training data consists of rephrased versions of the test data, which is the intended experimental design — the test sets themselves are held out from the exact training data."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Detection F1 scores are broken down by MMLU subject (Abstract Algebra, Sociology, US History) in Table 5 and by programming language (Python, C, JavaScript) in Table 6. Table 7 breaks down contamination by dataset."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Specific failure cases are shown: Figure 1 demonstrates an n-gram overlap failure on MMLU, Example 2 shows a false positive from n-gram overlap on multiple-choice questions, and Table 5 reveals embedding similarity search failures on translated samples."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Several negative results are reported: multilingual BERT fails on US History (F1=0.111 in Table 5), single-language C translation underperforms multi-language for HumanEval (45.7 vs 59.8 in Table 3), and multi-QA BERT completely fails on translated samples (F1=0 for Chinese in US History)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims are supported: 'a 13B model can easily overfit a test benchmark and achieve drastically high performance, on par with GPT-4' is confirmed by Tables 2-4. '8-18% of HumanEval benchmark overlaps' is confirmed by Table 7. 'n-gram overlap insufficient' is confirmed by Tables 5-6."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The central causal claim — training on rephrased test data causes benchmark score inflation — is supported by controlled experiments where the only manipulation is the inclusion of rephrased data in fine-tuning. The before/after comparison with the same model isolates the effect of rephrased data."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title 'Rethinking Benchmark and Contamination for Language Models' generalizes beyond the tested scope. Results are from two model families (Llama-2, CodeLlama) on three benchmarks, but the paper urges 'the community' to act broadly. No explicit statement of what settings were NOT tested or where findings might not hold."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper does not discuss alternative explanations for its main findings. For instance, whether fine-tuning on rephrased test data could impart legitimate problem-solving skills rather than pure memorization is not addressed. Section 6.1 raises the number-substituted case as an open question but does not discuss alternative interpretations of the main results."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper's measurements (benchmark accuracy, F1 for detection) directly match its claims about contamination effects and detection accuracy. The paper explicitly argues that high benchmark scores from rephrased training do not reflect real capability, properly distinguishing the measured proxy from the intended outcome."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Llama-2-7b, Llama-2-13b, CodeLlama 7B, and CodeLlama 13B are named with sizes, but GPT-4 — used as both the rephrasing engine and the LLM decontaminator — has no version or snapshot date specified. GPT-4 behavior varies across versions, making this a significant omission."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Appendix A provides the full prompt text for the MMLU and HumanEval rephrase/translate instructions. The decontaminator tool code is released in the GitHub repository, which would contain the detection prompt template."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Only partial hyperparameters are reported: 16 epochs for fine-tuning, 'non-zero initial temperature' for rephrasing (without stating the value), k=1 for some decontaminator runs, and embedding thresholds (0.5, 0.6, 0.9). Missing: GPT-4 temperature/sampling settings, learning rate, batch size, optimizer, max tokens."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The approach is a two-step pipeline (embedding similarity search + LLM judgment), not an agentic system."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The rephrasing algorithm (Algorithm 1) and detection algorithm (Algorithm 2) are documented, but fine-tuning data preparation is underdocumented. For real-world dataset analysis, sampling procedures are vaguely described ('we sample 16G of data from the GitHub subset') without detailing the sampling method or filtering steps."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "There is no dedicated limitations section. Section 6 ('Discussion') covers related topics including contamination definitions and future directions but does not systematically discuss limitations of the proposed method or experiments."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No specific threats to validity are discussed. The paper does not address potential issues such as the LLM decontaminator's dependence on GPT-4 quality, sensitivity to the top-k parameter of the embedding similarity search, or whether the detection accuracy generalizes to other domains."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No explicit scope boundaries are stated. The paper does not state what models, benchmarks, or domains were NOT tested, or where the LLM decontaminator might fail. Section 6.3 acknowledges 'how to detect contamination without access to training data remains an open problem' but does not bound the current study's scope."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The rephrased test sets, fine-tuned model weights, and raw detection results are not released for independent verification. Only the decontamination tool is released. Original benchmarks and training datasets are public but the specific experimental artifacts are not."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Data collection is described: rephrased samples are generated using Algorithm 1 with GPT-4, detection evaluation uses 200 prompt pairs (100 random + 100 rephrased) per subject/benchmark, and real-world datasets are identified by name with subset sizes (e.g., RedPajama 16G GitHub subset)."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. All data comes from standard public benchmarks and publicly available training datasets."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The rephrasing and detection pipelines are algorithmically described (Algorithms 1-2), but the fine-tuning pipeline lacks detail (only '16 epochs' stated). For real-world contamination analysis, the number of examples at each filtering stage is not reported — only final contamination counts appear in Table 7."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Acknowledgments section lists gifts from Anyscale, Astronomer, Google, IBM, Intel, Lacework, Microsoft, MBZUAI, Samsung SDS, Uber, and VMware. Lianmin Zheng's Meta PhD Fellowship is also disclosed."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly stated: UC Berkeley and Shanghai Jiao Tong University. The authors are not employed by the companies whose models or datasets are evaluated."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "The funding appears to be general research support from multiple tech companies, not tied to specific outcomes. The paper evaluates open-source datasets and models, not products of the funders. No funder has a direct stake in the contamination detection results."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The training data cutoff dates for Llama-2 and CodeLlama are not stated in this paper, despite these models being fine-tuned and evaluated on benchmarks that predate their training."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "The entire paper is about train/test overlap. It systematically studies how rephrased test data in training sets affects benchmark scores and proposes detection methods."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Benchmark contamination is the central topic. The paper demonstrates contamination effects on MMLU, HumanEval, and GSM-8k, proposes a stronger detection method, and reveals contamination in real-world datasets (Section 5.3)."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "Table 1 lists computational complexity classes (O(MN), O(N)) for detection methods but no actual costs (API dollars, wall-clock time, tokens consumed) are reported for any experiment."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No GPU hours, total API spend, or training time is reported. The fine-tuning of Llama-2/CodeLlama models and the GPT-4 API calls for rephrasing and detection would involve significant compute, but this is not quantified."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No mention of multiple random seeds. All results appear to be from single runs without seed sensitivity analysis."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search is described. The fine-tuning configuration (16 epochs) appears chosen without justification, and no search budget is reported."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The paper does not explain how configurations were selected. Embedding similarity thresholds (0.5, 0.6, 0.9) and n-gram sizes (10-gram, 50-character) are used without justification for these specific values."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Multiple comparisons are made across detection methods, benchmarks, subjects, and languages without any correction for multiple testing. No formal statistical tests are performed at all."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors implement all baseline detection methods themselves and compare against their proposed LLM decontaminator. No acknowledgment of potential bias from implementing baselines or possible suboptimal baseline tuning."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Table 1 shows theoretical complexity classes but no empirical comparison of compute budget vs. detection performance. The LLM decontaminator uses expensive GPT-4 calls while baselines are much cheaper, but this cost-performance tradeoff is not analyzed."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "The entire paper questions benchmark construct validity by demonstrating that high scores on MMLU, HumanEval, and GSM-8k can be achieved through contamination rather than genuine capability. Section 6.3 advocates for fresh one-time exams to address this validity problem."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is used in these experiments. Models are directly fine-tuned and evaluated on benchmarks."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "The paper does not discuss the temporal relationship between benchmark creation dates (MMLU 2020, HumanEval 2021, GSM-8k 2021) and the training data cutoffs of Llama-2/CodeLlama, despite this being relevant to assessing pre-existing contamination in the base models."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "Feature leakage is not discussed. For example, whether the rephrasing process preserves structural features that could serve as hints is not analyzed."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The independence between rephrased training samples and original test samples is not formally analyzed. By design, rephrased samples are semantically identical to test samples, but the paper does not discuss what degree of semantic overlap constitutes non-independence in the general case."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "The paper proposes and applies concrete leakage detection methods: n-gram overlap, embedding similarity search, and the novel LLM decontaminator (Algorithm 2). These are applied to real-world datasets to detect contamination."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Existing decontamination methods (n-gram overlap) fail to detect rephrased test samples that preserve the same semantics.",
    371       "evidence": "Tables 5-6 show F1 scores of 0 for 10-gram overlap on all rephrased (English and Chinese) MMLU samples and rephrased HumanEval samples across Python, C, and JavaScript. Figure 1 provides a concrete failure example.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "A 13B model fine-tuned on rephrased benchmark data achieves performance on par with GPT-4.",
    376       "evidence": "Table 2: Llama-2-13B reaches 85.9-89.9 on MMLU (GPT-4 ~86.4). Table 3: CodeLlama-13B reaches 81.1 on HumanEval (GPT-4 67.0). Table 4: Llama-2-13B reaches 95.3 on GSM-8k.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "The proposed LLM decontaminator significantly outperforms existing detection methods for finding rephrased samples.",
    381       "evidence": "Tables 5-6 show the LLM decontaminator achieves minimum F1 of 0.940 across all MMLU conditions and 0.974 across all HumanEval conditions, while n-gram overlap scores 0 on all rephrased samples and embedding similarity varies widely (0 to 0.990).",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "8-18% of HumanEval benchmark overlaps with popular pre-training datasets.",
    386       "evidence": "Table 7 and Figure 3: The Stack (18.9%), StarCoder-Data (15.9%), CodeAlpaca (12.8%), RedPajama (8.5%). Specific examples shown in Examples 3-4 and Appendix B.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Synthetic data generated by LLMs contains unintentional contamination from benchmarks.",
    391       "evidence": "Section 5.3: CodeAlpaca (generated by GPT-3.5 Davinci-003) contains 21 rephrased HumanEval samples (12.8%). Example 3 shows a concrete instance. Phi-1 report is also cited as corroborating evidence.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Multi-language data augmentation for code yields better contamination results than single-language translation.",
    396       "evidence": "Table 3: CodeLlama-13B achieves 67.1 on multi-language vs. 48.2 on C-only translation. CodeLlama-7B achieves 59.8 vs. 45.7.",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "No uncertainty quantification",
    403       "detail": "All benchmark scores and detection F1 scores are single point estimates with no error bars, confidence intervals, or multi-run variance. The stability of the fine-tuning results (e.g., whether 85.9 MMLU is consistent across runs) is unknown."
    404     },
    405     {
    406       "flag": "GPT-4 used as both creator and judge",
    407       "detail": "GPT-4 is used to generate rephrased samples (Algorithm 1) and also serves as the LLM decontaminator (Algorithm 2). This circular dependency could inflate detection accuracy — the detector may recognize its own rephrasing style more easily than other forms of contamination."
    408     },
    409     {
    410       "flag": "No limitations section",
    411       "detail": "The paper lacks a dedicated discussion of its own limitations. Key unaddressed issues include: sensitivity to the top-k parameter of the embedding similarity search, scalability to larger datasets, false positive rates of the LLM decontaminator in real-world settings, and whether findings generalize beyond the three benchmarks tested."
    412     },
    413     {
    414       "flag": "Incomplete training details",
    415       "detail": "Fine-tuning experiments report only '16 epochs' — no learning rate, optimizer, batch size, warmup schedule, or data formatting details. This makes reproduction of the contamination effect experiments difficult."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "Evaluating large language models trained on code",
    421       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    422       "year": 2021,
    423       "arxiv_id": "2107.03374",
    424       "relevance": "Introduces HumanEval benchmark, central to the contamination analysis in this paper and widely used for code generation evaluation."
    425     },
    426     {
    427       "title": "Measuring massive multitask language understanding",
    428       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    429       "year": 2020,
    430       "arxiv_id": "2009.03300",
    431       "relevance": "Introduces MMLU benchmark, one of the three main benchmarks used to demonstrate contamination via rephrased samples."
    432     },
    433     {
    434       "title": "Training verifiers to solve math word problems",
    435       "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"],
    436       "year": 2021,
    437       "arxiv_id": "2110.14168",
    438       "relevance": "Introduces GSM-8k benchmark used in this paper's contamination experiments."
    439     },
    440     {
    441       "title": "Llama 2: Open foundation and fine-tuned chat models",
    442       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    443       "year": 2023,
    444       "relevance": "Primary model family used in contamination experiments; also cited for its own contamination analysis showing 10% of MMLU test samples highly contaminated."
    445     },
    446     {
    447       "title": "GPT-4 technical report",
    448       "authors": ["OpenAI"],
    449       "year": 2023,
    450       "relevance": "Reports 25% HumanEval contamination in GPT-4 training data; uses 50-character overlap detection that this paper shows is insufficient."
    451     },
    452     {
    453       "title": "PaLM 2 technical report",
    454       "authors": ["Rohan Anil", "Andrew M. Dai", "Orhan Firat"],
    455       "year": 2023,
    456       "relevance": "Major LLM development that uses n-gram overlap for decontamination, which this paper shows is inadequate."
    457     },
    458     {
    459       "title": "Textbooks are all you need",
    460       "authors": ["Suriya Gunasekar", "Yi Zhang", "Jyoti Aneja"],
    461       "year": 2023,
    462       "relevance": "Phi-1 report that discovered significant contamination from LLM-generated synthetic data, corroborating this paper's findings about synthetic data contamination."
    463     },
    464     {
    465       "title": "Proving test set contamination in black box language models",
    466       "authors": ["Yonatan Oren", "Nicole Meister", "Niladri Chatterji"],
    467       "year": 2023,
    468       "relevance": "Proposes exchange detection for contamination in black-box models; alternative contamination detection method discussed in related work."
    469     },
    470     {
    471       "title": "Detecting pretraining data from large language models",
    472       "authors": ["Weijia Shi", "Anirudh Ajith", "Mengzhou Xia"],
    473       "year": 2023,
    474       "relevance": "Proposes Min-K% Prob detection for LLM contamination without training data access; complementary contamination detection approach."
    475     },
    476     {
    477       "title": "StarCoder: may the source be with you!",
    478       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    479       "year": 2023,
    480       "relevance": "Code pre-training dataset analyzed in this paper, found to have 15.9% HumanEval contamination."
    481     },
    482     {
    483       "title": "Quantifying memorization across neural language models",
    484       "authors": ["Nicholas Carlini", "Daphne Ippolito", "Matthew Jagielski"],
    485       "year": 2023,
    486       "relevance": "Studies LLM memorization and the boundary between generalization and memorization, directly relevant to understanding contamination effects."
    487     },
    488     {
    489       "title": "A survey on evaluation of large language models",
    490       "authors": ["Yupeng Chang", "Xu Wang", "Jindong Wang"],
    491       "year": 2023,
    492       "arxiv_id": "2307.03109",
    493       "relevance": "Surveys LLM evaluation challenges including benchmark trustworthiness concerns that motivate this contamination study."
    494     }
    495   ],
    496   "engagement_factors": {
    497     "practical_relevance": {
    498       "score": 3,
    499       "justification": "Released an immediately usable open-source decontamination tool on GitHub that practitioners can apply to their own datasets."
    500     },
    501     "surprise_contrarian": {
    502       "score": 2,
    503       "justification": "Quantifies a suspected but under-demonstrated problem: that simple rephrasing defeats standard decontamination, enabling a 13B model to match GPT-4."
    504     },
    505     "fear_safety": {
    506       "score": 1,
    507       "justification": "Raises concerns about benchmark trustworthiness and potential for gaming but does not demonstrate novel attacks or existential risks."
    508     },
    509     "drama_conflict": {
    510       "score": 2,
    511       "justification": "The finding that a 13B model matches GPT-4 via contamination, together with the revelation of contamination in popular datasets like RedPajama, gives the paper a 'benchmarks are unreliable' angle."
    512     },
    513     "demo_ability": {
    514       "score": 2,
    515       "justification": "Open-source GitHub tool available for download and use, though not a live web demo."
    516     },
    517     "brand_recognition": {
    518       "score": 2,
    519       "justification": "From UC Berkeley / LMSYS group (known for Chatbot Arena); uses well-known models (Llama-2, GPT-4) and datasets."
    520     }
    521   }
    522 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs