scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27637B)
      1 {
      2   "paper": {
      3     "title": "Linguistics Theory Meets LLM: Code-Switched Text Generation via Equivalence Constrained Large Language Models",
      4     "authors": [
      5       "Garry Kuwanto",
      6       "Chaitanya Agarwal",
      7       "Genta Indra Winata",
      8       "Derry Tanti Wijaya"
      9     ],
     10     "year": 2024,
     11     "venue": "arXiv",
     12     "arxiv_id": "2410.22660",
     13     "doi": "10.48550/arXiv.2410.22660"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "EZSWITCH integrates Equivalence Constraint Theory (ECT) with LLMs (Llama3, Llama3.1, Aya23) to generate code-switched text, showing statistically significant fluency improvements over unconstrained baselines (p<.001) across Hindi-English, Tamil-English, and Malayalam-English. A comprehensive correlation study reveals existing automatic metrics (BLEU, COMET, BERTScore) correlate poorly with human judgments for code-switching (Kendall's tau ≤0.29), while GPT-4o-mini achieves ~0.5 correlation. Strong directional asymmetry was found: Indic-to-English code-switching significantly outperforms English-to-Indic (mean difference 0.47 accuracy, 0.40 fluency, p<.001).",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Code released at https://github.com/gkuwanto/ezswitch as stated in the abstract footnote."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "CSPREF human preference dataset released at https://huggingface.co/datasets/garrykuwanto/cspref. Input datasets (HinGE, Samanantar) are publicly available."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Only hardware mentioned: 'single NVIDIA L40 GPU with 48GB of memory' (Section 3.1). No requirements.txt, Dockerfile, or library versions provided."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions found in the paper. The methodology is described but no runnable reproduction guide or README commands are mentioned."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Tables 2 and 3 report only point estimates (mean scores). No confidence intervals, error bars, or ± notation on any results."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "One-way ANOVAs conducted for Model, Method, and Direction factors (Table 6, all p<.001). Tukey's post-hoc tests with p-values reported for pairwise comparisons (Tables 7, 8, 9)."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Tukey's post-hoc tables (7, 8, 9) report mean differences alongside p-values, and Table 3 provides individual condition means, giving readers sufficient context to assess magnitude of effects (e.g., 'Llama3.1 vs. Aya23, Accuracy, 0.1881')."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification for why 150 samples were chosen for human evaluation, or why 2,000-2,766 parallel sentences were used per language pair. No power analysis."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No standard deviations or variance measures are reported for generation results. Inter-annotator agreement is reported via Krippendorff's alpha (Table 11), but variance across experimental runs is not."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Three methods compared: Baseline (unconstrained LLM), Human ECT (ECT with human translations), and EZSWITCH (ECT with LLM translations). Also compared with Word Replacement (WR) from Winata et al. (2019)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Models used are contemporary (Llama3, Llama3.1, Aya23 — all 2024). WR baseline (Winata et al. 2019) is the standard prior method for ECT-based code-switching generation."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The three-method comparison effectively ablates key components: Baseline removes ECT constraints entirely; Human ECT vs EZSWITCH isolates the effect of translation source (human vs LLM). Statistical tests (Tables 7, 8) quantify each component's contribution."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Multiple metrics used: Human Accuracy, Human Fluency, COMET (l1, l2, avg), GPT4oa, GPT4of, BLEU, BERTScore variants (Table 4)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Extensive human evaluation: 24,300 total evaluations by native bilingual speakers rating accuracy and fluency on a 1-3 scale, with 3 evaluators per sentence (Section 3.6)."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No description of dev/test split. The 150-sample subset for human evaluation is randomly sampled from the full dataset, but there is no indication of a separate development set used for prompt or method tuning."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results broken down by translation direction (English→CS vs Indic→CS), by model (Aya23, Llama3, Llama3.1), and by method (Baseline, Human ECT, EZSWITCH) in Tables 2 and 3."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 5.2 discusses where EZSWITCH underperforms baseline (from Indic input). Section 5.3 discusses directional asymmetry and potential annotator bias. Section 5.4 discusses metric failures."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 5.2 reports that from Indic input, 'human evaluations show that EZSWITCH and Human ECT perform similarly, but both underperform compared to the Baseline.' This is a clear negative result for the proposed method."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims 'significant improvement in quality' which is supported by ANOVA (F=20.80, p<.001 for Method on Fluency) and Tukey's tests showing significant fluency improvements over baseline in both directions (Table 8). The paper appropriately hedges about direction-dependent effects in the body."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper claims 'incorporating linguistic constraints into LLMs leads to more robust and human-aligned generation.' The controlled comparison between Baseline (no ECT) and ECT-constrained methods, with statistical tests, provides adequate evidence for this causal claim through systematic single-variable manipulation."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "Abstract claims results 'pav[e] the way for scalable code-switching text generation across diverse language pairs' but only three Indic-English pairs were tested. The title ('Code-Switched Text Generation') is broader than what was evaluated (three specific language pairs with 8B open-source models only)."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 5.3 discusses annotator L2 bias as an alternative explanation for directional asymmetry. Section 5.2 discusses LLM familiarity with Indic-English code-switching from training data as an alternative explanation for baseline performance."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures accuracy (meaning preservation) and fluency (naturalness) of code-switched text directly via human ratings. Claims match the granularity of measurements — they discuss code-switching generation quality in terms of these two specific dimensions, not broader unsubstantiated framing."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Open-source models specified with sizes (Aya23 8B, Llama3 8B, Llama3.1 8B) but GPT-4o-mini — used as a key evaluation tool — is specified only by marketing name without a snapshot date or API version."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table 10 (Appendix C) provides the actual prompt text for all methods: Translate, Baseline, EZSWITCH, and GPT Eval. Placeholders (e.g., <Input Sentence>) are clearly defined by the documented methodology."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No mention of temperature, top-p, max tokens, or any sampling parameters for any of the LLMs (Llama3, Llama3.1, Aya23, GPT-4o-mini)."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The system is a linear pipeline: translation → alignment → prompt-based generation."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 2.3 documents the full pipeline: obtaining translations (human or LLM, Section 2.3.1), bitext alignment with GIZA++ (Section 2.3.2), identification of valid switching points via ECT (Section 2.3.3 and Algorithm 1)."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "A dedicated 'Limitations' section is present, discussing scope restrictions to open-source models, plans for commercial model extension, and intent to include more languages."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The Limitations section contains generic statements: 'our study is intentionally limited in scope, focusing exclusively on the evaluation of open-source language models to ensure reproducibility.' No specific threats like sample size adequacy, annotator calibration concerns, or particular failure modes of ECT are discussed."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The Limitations section explicitly states the study is restricted to open-source models and three Indic-English language pairs, and that extension to commercial models and additional languages is future work."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "CSPREF dataset released on HuggingFace (https://huggingface.co/datasets/garrykuwanto/cspref) contains human preference annotations for verification."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.6 describes human evaluation setup in detail: 150 samples per language, 18 generation settings, 3 evaluators per sentence, 1-3 discrete scale, and what evaluators could see during annotation."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Ethics Statement describes recruitment: 'DeccanAI ethically recruits human evaluators in India, and evaluates them to determine their English and native language proficiency via their crowdsourcing platform.' Evaluators are contractually employed and trained on annotation guidelines."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline from parallel corpora → translation → alignment → switching points → generation → human evaluation is documented step by step in Section 2.3. Dataset sizes stated in Table 1."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No acknowledgments section or funding disclosure found anywhere in the paper."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations listed: Boston University, Deccan AI, Capital One, Monash University Indonesia. Note states 'The work was done outside the affiliation' for one author."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "Funding not disclosed so independence cannot be assessed. Notably, co-author Chaitanya Agarwal is from Deccan AI, which also provided the annotation services used for evaluation — a potential conflict that is not discussed."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial disclosure statement found in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates stated for any of the models (Llama3, Llama3.1, Aya23). These models could have been trained on the evaluation datasets."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether Llama3/Aya23 training data includes HinGE or Samanantar datasets, which are public and could be in pre-training corpora."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "HinGE (2021) and Samanantar (2022) are public datasets that predate Llama3 (2024) training. No discussion of whether the models may have seen these sentences during training, which could inflate generation quality."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No mention of pre-registration for the human evaluation study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": true,
    256         "answer": true,
    257         "justification": "Ethics Statement: 'All aspects of this research were reviewed and approved by the Institutional Review Board of our organization.'"
    258       },
    259       "demographics_reported": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "Only broad characterization: evaluators are native Indic language speakers, proficient in English, from 'major cities in India.' No structured demographics (age, gender, education, years of bilingual experience)."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "Ethics Statement describes criteria: evaluators must be 'native speakers of the respective Indic languages they assess and are proficient in English. Their language proficiency is evaluated through custom online tests.'"
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "This is an annotation task, not an experimental study with treatment conditions. Evaluators all perform the same task (rating sentences), so randomization to conditions is not applicable."
    273       },
    274       "blinding_described": {
    275         "applies": true,
    276         "answer": false,
    277         "justification": "No description of whether evaluators were blinded to which method/model generated each sentence. Evaluators see the input and output but blinding to experimental condition is not mentioned."
    278       },
    279       "attrition_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No mention of evaluator attrition, dropout, or how many evaluators started versus completed the annotation task."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No mention of inference cost, tokens consumed, wall-clock time per generation, or total API costs despite running 18 generation configurations across thousands of sentences."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Hardware mentioned ('single NVIDIA L40 GPU with 48GB of memory') but no total compute budget, GPU hours, or runtime reported."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of random seeds or seed sensitivity analysis for any generation experiments."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Not stated whether results are from a single generation run or averaged across multiple runs."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search described. Prompts appear to be designed without systematic tuning, and no search budget is reported."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "All 18 configurations (3 methods × 3 models × 2 directions) are reported in Tables 2 and 3. No selective reporting of best results; all conditions presented."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Tukey's HSD post-hoc test is used for pairwise comparisons (Tables 7, 8, 9), which inherently controls for family-wise error rate across multiple comparisons."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No acknowledgment that the authors are evaluating their own system (EZSWITCH) against baselines. No independent evaluation or discussion of author-evaluation bias."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": false,
    330         "answer": false,
    331         "justification": "All methods use the same models with different prompts. Compute differences between methods are negligible."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Section 5.4 extensively discusses whether automatic metrics (BLEU, COMET, BERTScore) actually capture code-switching quality. Table 4 quantifies the gap between metrics and human judgment. The paper demonstrates that standard benchmarks fail to measure what they claim."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. The pipeline is a direct prompting approach without agentic scaffolding."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "Not discussed. HinGE (2021) and Samanantar (2022) predate the training of Llama3/Aya23 (2024), so temporal leakage is a real concern."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. The EZSWITCH method provides target-language words as constraints in the prompt, which could be seen as providing information that aids generation quality, but this is by design rather than leakage. The broader question of evaluation setup leaking information is not addressed."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "Not discussed. No analysis of whether the public datasets (HinGE, Samanantar) appeared in LLM training data, which would create non-independence between training and evaluation."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method used. No membership inference, n-gram overlap analysis, or decontamination applied."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "ECT-constrained generation (EZSWITCH) produces significantly more fluent code-switched text than unconstrained baseline LLMs.",
    370       "evidence": "ANOVA: Method factor F=20.80, p<.001 for Fluency (Table 6). Tukey's tests show Human ECT and EZSWITCH both significantly outperform Baseline for Fluency in both directions (Table 8, p<.003 for all fluency comparisons).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Existing automatic metrics (BLEU, COMET, BERTScore) correlate poorly with human judgments for code-switching evaluation.",
    375       "evidence": "Table 4: Kendall's tau correlations with human accuracy/fluency: COMET_avg (0.246/0.290), BLEU (0.229/0.201), BERTScore variants (0.064-0.204). All below 0.3.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "GPT-4o-mini as evaluator achieves better alignment with human judgments than traditional metrics for code-switching.",
    380       "evidence": "Table 4: GPT4oa achieves 0.558 correlation with human accuracy and GPT4of achieves 0.514 with human fluency, substantially higher than all other automatic metrics.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Translation direction significantly affects code-switching quality, with Indic-to-English substantially outperforming English-to-Indic.",
    385       "evidence": "Table 9: mean difference 0.4684 for Accuracy and 0.4037 for Fluency (both p<.001). The ANOVA Direction factor has the highest F-statistics: F=323.13 for Accuracy, F=293.31 for Fluency (Table 6).",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Llama3.1 8B consistently achieves higher accuracy and fluency than other models for code-switching generation.",
    390       "evidence": "Tables 3, 7: Llama3.1 significantly outperforms Aya23 and Llama3 for English-to-Indic accuracy (p<.001). For Indic-to-English, both Llama3 and Llama3.1 outperform Aya23.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "EZSWITCH with LLM-generated translations performs comparably to Human ECT with gold translations.",
    395       "evidence": "Tukey's tests (Table 8) show no significant difference between Human ECT and EZSWITCH for either accuracy or fluency in both translation directions (p>0.69 for all comparisons).",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Low inter-annotator agreement",
    402       "detail": "Krippendorff's alpha for Tamil Fluency is 0.321 and Accuracy is 0.445 (Table 11). Malayalam is similarly low (0.405/0.423). Only Hindi shows acceptable agreement (0.646/0.720). This undermines the reliability of the primary evaluation metric — human judgments — for two of three language pairs."
    403     },
    404     {
    405       "flag": "Very small effect sizes on a coarse scale",
    406       "detail": "Human ratings use a 1-3 discrete scale. Typical differences between methods are 0.05-0.19 points, meaning most improvements represent less than 10% of the scale range. The coarse granularity may be insufficient to capture meaningful quality differences."
    407     },
    408     {
    409       "flag": "No hyperparameters reported",
    410       "detail": "Temperature, top-p, and other sampling parameters are not reported for any model (Llama3, Llama3.1, Aya23, GPT-4o-mini). These settings significantly affect generation quality and reproducibility."
    411     },
    412     {
    413       "flag": "Potential conflict of interest not disclosed",
    414       "detail": "Co-author Chaitanya Agarwal is affiliated with Deccan AI, which also provided the human annotation services used for the primary evaluation. This financial relationship is not discussed as a potential conflict."
    415     },
    416     {
    417       "flag": "No contamination analysis",
    418       "detail": "HinGE (2021) and Samanantar (2022) are public datasets that likely appear in the training data of Llama3/Llama3.1 (2024). If models memorized these sentences, generation quality would be artificially inflated. This is not discussed."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Are multilingual models effective in code-switching?",
    424       "authors": ["Genta Indra Winata", "Samuel Cahyawijaya", "Zihan Liu", "Zhaojiang Lin", "Andrea Madotto", "Pascale Fung"],
    425       "year": 2021,
    426       "relevance": "Evaluates LLM multilingual capabilities in code-switching contexts, directly relevant to understanding model limitations."
    427     },
    428     {
    429       "title": "Multilingual large language models are not (yet) code-switchers",
    430       "authors": ["Ruochen Zhang", "Samuel Cahyawijaya", "Jan Christian Blaise Cruz", "Genta Indra Winata", "Alham Fikri Aji"],
    431       "year": 2023,
    432       "relevance": "Demonstrates current LLM limitations in code-switching generation, establishing the problem this paper addresses."
    433     },
    434     {
    435       "title": "The decades progress on code-switching research in NLP: A systematic survey on trends and challenges",
    436       "authors": ["Genta Indra Winata", "Alham Fikri Aji", "Zheng Xin Yong", "Thamar Solorio"],
    437       "year": 2023,
    438       "relevance": "Comprehensive survey of computational code-switching research, relevant as a survey methodology example."
    439     },
    440     {
    441       "title": "Prompting multilingual large language models to generate code-mixed texts: The case of south east asian languages",
    442       "authors": ["Zheng Xin Yong", "Ruochen Zhang", "Jessica Forde"],
    443       "year": 2023,
    444       "relevance": "Explores LLM prompting for code-switching generation, a direct comparison method for evaluating prompt-based approaches."
    445     },
    446     {
    447       "title": "The Llama 3 herd of models",
    448       "authors": ["Abhimanyu Dubey"],
    449       "year": 2024,
    450       "arxiv_id": "2407.21783",
    451       "relevance": "Technical report for the primary model (Llama3 8B) used in the evaluation experiments."
    452     },
    453     {
    454       "title": "Aya 23: Open weight releases to further multilingual progress",
    455       "authors": ["Viraat Aryabumi"],
    456       "year": 2024,
    457       "arxiv_id": "2405.15032",
    458       "relevance": "Technical report for Aya23, one of three models evaluated for multilingual code-switching generation."
    459     },
    460     {
    461       "title": "COMET: A neural framework for MT evaluation",
    462       "authors": ["Ricardo Rei", "Craig Stewart", "Ana C Farinha", "Alon Lavie"],
    463       "year": 2020,
    464       "relevance": "Machine translation evaluation metric used in this study; correlation analysis reveals its limitations for code-switching evaluation."
    465     },
    466     {
    467       "title": "Code-mixed probes show how pre-trained models generalise on code-switched text",
    468       "authors": ["Frances Adriana Laureano De Leon", "Harish Tayyar Madabushi", "Mark Lee"],
    469       "year": 2024,
    470       "relevance": "Investigates how pre-trained models handle code-switched text, relevant to understanding LLM code-switching capabilities."
    471     }
    472   ]
    473 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs