ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26144B)


      1 {
      2   "paper": {
      3     "title": "Lost in the Mix: Evaluating LLM Understanding of Code-Switched Text",
      4     "authors": ["Amr Mohamed", "Yang Zhang", "Michalis Vazirgiannis", "Guokan Shang"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2506.14012"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "A GitHub repository URL is provided: https://github.com/amr-mohamedd/Lost-in-the-Mix.git (Section 1, footnote 1). The paper states 'Code and data are publicly available.'"
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'Code and data are publicly available' (Section 1) and uses publicly available benchmarks (Belebele, MMLU, XNLI). The generated code-switched variants are described as available via the GitHub repository."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Appendix A states 'All experiments were conducted using NVIDIA A100 (40GB VRAM) and A10 (24GB VRAM) GPU clusters.' Appendix D specifies training hyperparameters including mixed-precision BF16, 4096-token window, batch size of four. However, no requirements.txt or Dockerfile is mentioned; the hardware and key training parameters are specified but full software dependencies are not documented."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While a GitHub link is provided, the paper itself does not include step-by-step reproduction instructions, a 'Reproducing Results' section, or specific commands to run experiments. The methodology is described but operational reproduction steps are absent from the paper text."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as point estimates (e.g., accuracy values like 0.66, 0.70) without confidence intervals, error bars, or any uncertainty quantification."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes numerous comparative claims (e.g., 'performance declines,' 'improved comprehension') but no statistical significance tests (p-values, t-tests, etc.) are reported anywhere. Differences are stated as raw numerical comparisons."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports accuracy deltas (Delta Acc) throughout, showing both baseline and code-switched values (e.g., 'Llama 3B decreased from 0.54 to 0.43 on EN→DE (Delta = -0.11)'). This provides sufficient context to understand the magnitude of effects."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is provided for the sample sizes used. The 300-sample test sets for CSW approach evaluation (Section 3.4) and 100-sample comparisons (Appendix B) are stated without power analysis or rationale for why these sizes are adequate."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported for any experiment. Results appear to be single-run evaluations with no information about variability across runs or seeds."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Monolingual English performance serves as the baseline throughout all experiments (Tables 1-5), allowing direct comparison of code-switched performance against the original benchmark performance."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The models evaluated include contemporary LLMs: LLaMA 3.1/3.2 (2024), Qwen 2.5 (2025), Mistral 7B v0.3 (2023), and ALLaM 7B (2024). The benchmarks (Belebele, MMLU, XNLI) are standard and widely used."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 6 presents ablation studies: Section 6.1 reverses language roles (English as embedded), Section 6.2 tests extreme multi-language CSW with different language combinations (Settings 1-3). Section 7 ablates mitigation strategies (prompting vs. fine-tuning)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Three metrics are formally defined in Section 3.5: accuracy, weighted average accuracy, and accuracy delta (Delta Acc). Results are also broken down by individual benchmark (Belebele, MMLU, XNLI) in Figure 2."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Appendix B.1 describes bilingual annotators conducting pairwise preference evaluations of CSW text quality between Claude and GPT-4o outputs (100 examples per language pair, Table 6), assessing which 'sounded more natural.'"
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The evaluation uses established benchmark test sets (Belebele, MMLU, XNLI) that are separate from any training data. For the fine-tuning experiment (Section 7.2), training data comes from TED Talk transcripts while evaluation is on the benchmark test sets."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per model (8 models), per embedded language (4 languages), and per benchmark task (Belebele, MMLU, XNLI) as shown in Figure 2 and Tables 1-5."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 7.1 discusses cases where prompting was detrimental: 'Llama 70B, especially on EN→AR and EN→ZH, where performance fell by 13 and 17 points respectively.' Section 8 discusses where models 'struggle to process disrupted monolingual structures.'"
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Several negative results are reported: prompt-based mitigation was 'detrimental' for Llama and Mistral models (Section 7.1, Table 5), fine-tuning achieved only 'partial recovery' (Section 7.2), and H1 (degradation proportional to linguistic distance) was not fully supported by the data."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims (degradation when foreign tokens disrupt English, improvement when English embedded in other languages, mixed prompting results, fine-tuning as more stable path) are all supported by corresponding results in Tables 1-5 and Figure 3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims such as 'embedding English tokens into other languages often improves comprehension' and 'code-switching introduces specific comprehension challenges.' These are based on pre-post comparisons without controlling for confounds like tokenization differences, token frequency effects, or benchmark translation quality. No causal identification strategy is used."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title and abstract frame results broadly as 'LLM Understanding of Code-Switched Text,' but the study tests only 8 models, 5 languages (English plus 4 embedded languages), 3 benchmarks, and 2 CSW methods (noun-token and ratio-token). The Limitations section acknowledges only CSW method constraints, not the narrow language/model scope relative to the broad framing."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not substantively discuss alternative explanations. The claim that degradation stems from 'structural vulnerability' rather than 'token-level unfamiliarity' (Section 8) is asserted without ruling out confounds like tokenization artifacts, training data distribution, or benchmark translation quality affecting results."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Model families and sizes are specified (e.g., 'LLaMA 3.2 Instruct (3B)', 'Qwen 2.5 Instruct (3B, 7B, 72B)', 'Mistral 7B Instruct (v0.3)'), but exact model version strings or snapshot dates are not provided. For the generation models, only 'Claude 3.5 Sonnet' and 'GPT-4o' are given without version identifiers or API snapshot dates."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompt texts are provided in the appendices: CSW generation prompts (Figures 4-6), evaluation prompt (Figure 7), mitigation prompts for each benchmark (Figures 8-10), and instruction tuning templates (Figures 11-15)."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix D reports fine-tuning hyperparameters: learning rate 2e-6, linear decay, 5% warmup, BF16, 4096-token window, batch size 4, single epoch. However, inference hyperparameters (temperature, top-p) for the evaluated models are not reported."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The multi-step CSW generation pipeline is described in detail in Section 3.3: step 1 identifies placeholder positions, step 2 fills placeholders with target language words. The alignment-based pipeline components (AWESOME aligner, LaBSE embeddings, Stanza POS tagger) are specified."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The data preprocessing is documented: Sections 3.3-3.4 describe the CSW generation pipeline. Section 7.2 documents the fine-tuning data preparation: selecting TED Talk sentences >70 words, yielding ~3,650 pairs per language, totaling ~14,600 training samples. The EleutherAI harness adaptation is mentioned in Section 4."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A dedicated 'Limitations' section appears after Section 8 (Discussion and Conclusion), discussing constraints of the noun-token approach and the fixed 20% substitution rate."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The Limitations section is brief and mostly generic: it mentions that 'more complex forms of code-switching may induce more severe performance degradation' and that the 20% ratio was fixed. These are methodological scope notes, not specific threats to validity of the reported findings (e.g., no discussion of CSW generation quality affecting results, possible benchmark translation artifacts, or tokenization confounds)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The Limitations section mentions future work directions but does not explicitly state what the results do NOT show. There are no statements bounding the claims to the specific models, languages, or benchmarks tested. The paper does not say, e.g., 'our results do not generalize to other language families or proprietary models.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper states 'Code and data are publicly available' with a GitHub link (Section 1). The underlying benchmarks (Belebele, MMLU, XNLI) are publicly available standard datasets."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data collection/generation process is described in detail across Sections 3.2-3.4 and Appendices B-D: how CSW variants were generated, which approaches were compared, and how the fine-tuning dataset was constructed from TED Talk transcripts."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Appendix B.1 mentions 'bilingual annotators' who conducted preference evaluations, but provides no information about how they were recruited, their qualifications, number of annotators, or potential selection bias."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline from benchmark selection to CSW generation to evaluation is documented: benchmark selection (Section 4), CSW generation methods and selection (Sections 3.3-3.4, Appendix B), evaluation setup using the EleutherAI harness (Section 4), and fine-tuning data preparation (Section 7.2, Appendix D)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper. The authors are affiliated with MBZUAI and Ecole Polytechnique, but no funding sources are disclosed."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed on the first page: (1) MBZUAI and (2) Ecole Polytechnique."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so it is impossible to assess whether any funder has a stake in the outcome. The absence of any funding disclosure means this criterion cannot be satisfied."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial disclosure is provided anywhere in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates multiple LLMs on established benchmarks (MMLU, Belebele, XNLI) but does not state the training data cutoff dates for any of the models evaluated."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of potential train/test overlap. MMLU (published 2020) and XNLI (published 2018) are well-known benchmarks that are very likely present in training data of models trained after their publication. This contamination risk is not addressed."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "MMLU (2020), XNLI (2018), and Belebele (2023) were all published before the training cutoffs of the models tested (LLaMA 3, Qwen 2.5, etc.). One could argue that the code-switched variants are novel, but the original English benchmarks are likely contaminated, and the paper does not discuss how this might affect the baseline comparisons or the measured accuracy deltas."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "The human evaluation was limited to pairwise preference judgments of CSW text quality by bilingual annotators (Appendix B.1), not a human subjects study requiring pre-registration."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "The paper involves annotator preference judgments, not a human subjects study with participants as research subjects. IRB approval is not typically required for this type of evaluation."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the sense of a human subjects study. The bilingual annotators are evaluators of text quality, not research subjects."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants as research subjects. The annotators are part of the methodology, not the subject population."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "Not a human subjects experiment. No randomization of participants to conditions."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "Not a human subjects experiment requiring blinding of participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "Not a human subjects study with participant attrition."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference costs, API costs, or per-example costs are reported, despite using commercial APIs (Claude 3.5 Sonnet, GPT-4o) extensively for CSW generation and evaluation."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Appendix A states: 'The compute allocation totaled 22 GPU-days, comprising 8 GPU-days on 8xA100 nodes and 14 GPU-days on 4xA10 nodes.'"
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Embedding non-English tokens into English text consistently degrades LLM performance, even when switches follow linguistic constraints.",
    286       "evidence": "Tables 1 and 2 show consistent drops across all 8 models and 4 embedded languages for both noun-token and ratio-token CSW. E.g., LLaMA-70B drops from 0.70 to 0.66-0.67 (Section 5.1).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Embedding English tokens into non-English matrix languages often improves comprehension.",
    291       "evidence": "Table 3 (Section 6.1) shows improvements in most model-language combinations, e.g., Mistral 7B Arabic rising from 0.35 to 0.48 (Delta = +0.13). Only one case showed a minor drop (Qwen 7B on Chinese, Delta ~ -0.01).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Code-switching complexity does not linearly correlate with performance degradation.",
    296       "evidence": "Table 4 (Section 6.2) shows that extreme CSW with 4 embedded languages (Setting 3) does not consistently yield the worst results across all models. Some models perform better with more languages than fewer.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Prompt-based mitigation yields inconsistent results across models.",
    301       "evidence": "Table 5 (Section 7.1) shows Qwen family models improve with prompting while Llama and Mistral models degrade significantly (e.g., Llama 70B drops by 13-17 points on some language pairs).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Fine-tuning on code-switched data leads to more stable performance recovery than prompting.",
    306       "evidence": "Figure 3 and Section 7.2 show CSW-Llama 8B recovers +0.03 to +0.04 accuracy (3-4 percentage points) across language pairs after fine-tuning. However, this is tested on only one model (Llama 8B) and recovery is partial.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "Degradation does not stem solely from token-level unfamiliarity but from a more fundamental processing difficulty.",
    311       "evidence": "Section 8 argues that similar degradation under both linguistically constrained (noun-token) and random (ratio-token) CSW suggests a structural vulnerability. However, this interpretation is not tested against alternative explanations like tokenization artifacts.",
    312       "supported": "weak"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "The paper systematically evaluates 8 LLMs on code-switched versions of Belebele, MMLU, and XNLI benchmarks across 5 languages. Embedding non-English tokens into English consistently degrades performance (up to -0.11 accuracy), while embedding English into other languages often improves performance (up to +0.13), revealing an asymmetric linguistic vulnerability. Prompt-based mitigation helps some model families (Qwen) but harms others (Llama, Mistral), while fine-tuning on CSW data provides more consistent but partial recovery, tested only on Llama 8B.",
    317   "red_flags": [
    318     {
    319       "flag": "No statistical significance tests",
    320       "detail": "All comparative claims are based on raw accuracy differences without any significance testing. With single-run evaluations and no variance reporting, it is impossible to know whether observed differences (e.g., 1-2 percentage point changes) are meaningful or within noise."
    321     },
    322     {
    323       "flag": "Benchmark contamination risk unaddressed",
    324       "detail": "MMLU (2020) and XNLI (2018) are well-known benchmarks almost certainly present in the training data of the models tested (LLaMA 3, Qwen 2.5, etc.). The baseline English accuracy may reflect memorization rather than comprehension, which would confound the interpretation of accuracy deltas under code-switching."
    325     },
    326     {
    327       "flag": "Fine-tuning claim based on single model",
    328       "detail": "The claim that fine-tuning is a 'more stable path to degradation mitigation' is based on experiments with only Llama 8B. Generalizing from one model to a broad conclusion about mitigation strategies is unsupported."
    329     },
    330     {
    331       "flag": "No inference temperature or sampling parameters reported",
    332       "detail": "While fine-tuning hyperparameters are specified, the inference settings (temperature, top-p, etc.) for all 8 evaluated models are not reported. These settings significantly affect LLM outputs and reproducibility."
    333     },
    334     {
    335       "flag": "Annotator details missing",
    336       "detail": "The human evaluation (bilingual annotators, Appendix B.1) provides no information about the number of annotators, their qualifications, inter-annotator agreement, or recruitment process."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Multilingual large language models are not (yet) code-switchers",
    342       "authors": ["Ruochen Zhang", "Samuel Cahyawijaya", "Jan Christian Blaise Cruz", "Genta Winata", "Alham Fikri Aji"],
    343       "year": 2023,
    344       "relevance": "Directly relevant study evaluating LLM code-switching capabilities, establishing the baseline that this paper extends."
    345     },
    346     {
    347       "title": "Judging LLM-as-a-judge with MT-bench and chatbot arena",
    348       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    349       "year": 2023,
    350       "relevance": "Foundational paper for the LLM-as-a-judge evaluation methodology used in this paper for CSW quality assessment."
    351     },
    352     {
    353       "title": "The llama 3 herd of models",
    354       "authors": ["Aaron Grattafiori", "Abhimanyu Dubey"],
    355       "year": 2024,
    356       "arxiv_id": "2407.21783",
    357       "relevance": "Model paper for LLaMA 3 family used as primary evaluation targets in the study."
    358     },
    359     {
    360       "title": "Qwen3 technical report",
    361       "authors": ["An Yang", "Anfeng Li", "Baosong Yang"],
    362       "year": 2025,
    363       "arxiv_id": "2505.09388",
    364       "relevance": "Model paper for Qwen family used as evaluation targets; Qwen showed distinctive behavior under prompt-based mitigation."
    365     },
    366     {
    367       "title": "Measuring massive multitask language understanding",
    368       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    369       "year": 2020,
    370       "arxiv_id": "2009.03300",
    371       "relevance": "MMLU benchmark paper; one of the three core benchmarks used for evaluating code-switching impact on LLM comprehension."
    372     },
    373     {
    374       "title": "A survey of large language models",
    375       "authors": ["Wayne Xin Zhao", "Kun Zhou", "Junyi Li"],
    376       "year": 2023,
    377       "arxiv_id": "2303.18223",
    378       "relevance": "Comprehensive LLM survey providing context for the capabilities and limitations of models evaluated in this study."
    379     },
    380     {
    381       "title": "The language model evaluation harness",
    382       "authors": ["Leo Gao", "Jonathan Tow", "Baber Abbasi"],
    383       "year": 2024,
    384       "relevance": "Evaluation infrastructure (EleutherAI harness) adapted for the code-switched benchmark evaluations in this study."
    385     },
    386     {
    387       "title": "Linguistics theory meets LLM: Code-switched text generation via equivalence constrained large language models",
    388       "authors": ["Garry Kuwanto", "Chaitanya Agarwal", "Genta Indra Winata", "Derry Tanti Wijaya"],
    389       "year": 2024,
    390       "arxiv_id": "2410.22660",
    391       "relevance": "Prior work on linguistically constrained LLM-based code-switched text generation that this paper builds upon."
    392     },
    393     {
    394       "title": "LLM-based code-switched text generation for grammatical error correction",
    395       "authors": ["Tom Potter", "Zheng Yuan"],
    396       "year": 2024,
    397       "relevance": "Prior work on LLM-based CSW generation approaches that motivated the generation methodology in this study."
    398     },
    399     {
    400       "title": "Beyond metrics: evaluating LLMs' effectiveness in culturally nuanced, low-resource real-world scenarios",
    401       "authors": ["Millicent Ochieng", "Varun Gumma", "Sunayana Sitaram"],
    402       "year": 2024,
    403       "arxiv_id": "2406.00343",
    404       "relevance": "Related work on evaluating LLM performance in multilingual and culturally diverse settings."
    405     },
    406     {
    407       "title": "Allam: Large language models for arabic and english",
    408       "authors": ["M Saiful Bari", "Yazeed Alnumay"],
    409       "year": 2024,
    410       "arxiv_id": "2407.15390",
    411       "relevance": "Model paper for ALLaM, the Arabic-specialized LLM used as one of the evaluation targets in this study."
    412     }
    413   ]
    414 }

Impressum · Datenschutz