scan.json (27218B)
1 { 2 "paper": { 3 "title": "RATHAN@DravidianLangTech 2025: Annaparavai - Separate the Authentic Human Reviews from AI-generated one", 4 "authors": [ 5 "Jubeerathan Thevakumar", 6 "Luheerathan Thevakumar" 7 ], 8 "year": 2025, 9 "venue": "Proceedings of the Fifth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages (DravidianLangTech@NAACL 2025)", 10 "doi": "10.18653/v1/2025.dravidianlangtech-1.66" 11 }, 12 "scan_version": 3, 13 "active_modules": [ 14 "experimental_rigor", 15 "data_leakage" 16 ], 17 "methodology_tags": [ 18 "benchmark-eval" 19 ], 20 "key_findings": "A transfer learning approach using embeddings from XLM-RoBERTa, IndicBERT, mT5, and Sentence-BERT, combined via weighted-average DNN ensemble, achieves 90% F1 for Malayalam and 73% F1 for Tamil on a shared-task AI-generated review detection dataset. Cross-validation results (94–97% F1) did not transfer to the Tamil test set, suggesting distribution mismatch between training and test data. Individual DNN models on the training split achieved up to 98.2% F1, but ensemble generalization varied by language.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "GitHub repository URL provided in footnote 1: https://github.com/Jubeerathan/Annaparavai. The abstract and conclusion both state 'source code is publicly available.'" 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The paper uses shared task datasets from Premjith et al. (2025), which are publicly distributed through the DravidianLangTech 2025 shared task." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No requirements.txt, Dockerfile, conda environment, or library version listing is mentioned in the paper. No environment setup details beyond naming the models used." 
37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README content or reproduction steps are described." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": true, 48 "justification": "Tables 1 and 2 report standard deviation alongside mean F1-scores for 5-fold cross-validation (e.g., Sentence-BERT: 0.962 ± 0.014 for Tamil). However, no uncertainty is reported for the DNN/ensemble results or the final test set evaluation." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "No statistical significance tests are reported. Comparisons between models and between ensemble vs. individual models are made purely by comparing point F1-scores." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "No effect sizes are reported. Results are given as absolute F1-scores without contextualized comparisons (e.g., no Cohen's d, no relative improvement over a baseline)." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "The datasets are small (808 Tamil training, 800 Malayalam training, 100/200 test) with no justification for these sizes. No power analysis or discussion of whether sample sizes are adequate for the claims made." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": true, 68 "justification": "Standard deviation is reported across 5-fold cross-validation runs in Tables 1 and 2. However, no variance is reported for the DNN training runs or the final test set evaluation (Table 3)." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": false, 75 "justification": "No external baselines are included. 
The paper only compares its own model variants (four individual DNNs and an ensemble). No comparison against other shared task participants, naive baselines (e.g., majority class, random), or prior work on the same dataset." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": false, 80 "justification": "No external baselines are included at all, so the question of whether they are contemporary is moot." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": false, 85 "justification": "The paper shows individual model results and the ensemble result (Table 3), but does not ablate the ensemble by systematically removing components (e.g., ensemble minus one model). The individual-vs-ensemble comparison does not constitute a controlled ablation." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": false, 90 "justification": "Only F1-score is used as the evaluation metric. Confusion matrices are shown (Figures 5–6) but precision, recall, and accuracy are not explicitly reported as separate metrics." 91 }, 92 "human_evaluation": { 93 "applies": false, 94 "answer": false, 95 "justification": "Human evaluation is not relevant to this binary classification task on a labeled dataset. Automated metrics are appropriate." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "The shared task provides a separate held-out test set (100 Tamil, 200 Malayalam samples). The paper also splits training data into 70/21/9 for development. Final results (73% Tamil, 90% Malayalam) are reported on the given test set." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Results are broken down by language (Tamil and Malayalam) and by embedding model (4 individual DNNs plus ensemble) in Tables 1–3. Confusion matrices show per-class performance." 
106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": false, 110 "justification": "The paper notes the Tamil performance drop (Section 5) and speculates about distribution mismatch, but does not show or analyze specific failure examples or error categories." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section 5 honestly reports the significant performance drop on Tamil test data (73% vs. ~96% on CV), and acknowledges that CV performance did not transfer. This is a negative finding the authors could have downplayed." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims '90% f1-score for Malayalam and 73% for Tamil' which are directly supported by the test set results. The claim about 'effectiveness of transfer learning and ensembling' is supported by the results, though without external baselines the strength of that evidence is limited." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": false, 127 "justification": "The paper claims to 'demonstrat[e] the effectiveness of transfer learning and ensembling for review detection' (abstract, conclusion). Without external baselines or non-transfer-learning comparisons, the causal claim that transfer learning is what makes this effective is not justified — a simpler approach might perform similarly." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The abstract claims 'effectiveness of transfer learning and ensembling for review detection' broadly. The conclusion says 'demonstrating the effectiveness of transfer learning in low-resource Dravidian languages' — generalizing from two languages and one task to all 'low-resource Dravidian languages.' Results are from a single shared task dataset with specific characteristics." 
133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "Section 5 briefly speculates that Tamil performance drops may be due to distribution mismatch, but does not systematically consider alternative explanations for overall results (e.g., dataset artifacts, trivial length/style differences between human and AI reviews, overfitting)." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper measures F1-score for binary classification (human vs. AI reviews) and claims performance on this detection task. The measurements match the granularity of the claims with no proxy gap." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": false, 149 "justification": "The paper names 'XLM-RoBERTa', 'Indic-BERT', 'mT5', and 'Sentence-BERT' without specifying exact model sizes, checkpoint versions, or Hugging Face model IDs. For example, XLM-RoBERTa comes in base and large variants, but the paper does not specify which was used." 150 }, 151 "prompts_provided": { 152 "applies": false, 153 "answer": false, 154 "justification": "No prompting is used. The pre-trained models are used solely as embedding extractors, not via prompting." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "No hyperparameters are reported for the DNN models (learning rate, batch size, epochs, optimizer, hidden layer sizes) or XGBoost (learning rate, max depth, n_estimators). The ensemble weights are also not disclosed. Figure 3 shows a DNN architecture diagram but without specific values." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. The system is a standard embedding-to-classifier pipeline." 
165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 4.1 explicitly documents that no preprocessing was performed and explains why: the dataset consists of short, clean texts, and the language models handle tokenization internally. The data split ratios (70/21/9) are stated in Section 4.2." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 7 is titled 'Limitations' and contains a substantive paragraph discussing biases inherited from language models and dataset size constraints." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": false, 181 "justification": "The limitations in Section 7 are generic: 'biases inherited from the language models' and 'the dataset is limited.' These are not specific to this study — they apply to nearly any transfer-learning classification paper." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper does not explicitly state what the results do NOT show or what settings are excluded. No specific boundaries like 'our results apply only to product reviews generated by [specific models]' or 'we do not claim generalization to other domains.'" 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "The datasets are from the DravidianLangTech 2025 shared task (Premjith et al., 2025) and are distributed through the shared task, making them available for independent verification." 
194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": false, 198 "justification": "Section 3 describes dataset sizes and labels (808/100 Tamil, 800/200 Malayalam, annotated as Human/AI) but does not describe how the data was collected — what tools generated the AI reviews, what products were reviewed, or how human reviews were sourced. This is deferred to Premjith et al. (2025)." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. The data is from a standard shared task benchmark." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": false, 208 "justification": "The pipeline from embeddings to classification is described at a high level (Section 4.2), but exact counts at each stage are missing. The 70/21/9 split is stated as percentages but not in absolute numbers, and no information about how the split was performed (stratified? random seed?) is given." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding sources are mentioned anywhere in the paper. No acknowledgments section is present." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed: Jubeerathan Thevakumar at University of Moratuwa, Colombo, Sri Lanka; Luheerathan Thevakumar from Jaffna, Sri Lanka. Neither evaluates a product from their employer." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": false, 224 "answer": false, 225 "justification": "No funding is disclosed; this appears to be unfunded student/independent work." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement is included in the paper." 
231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "The pre-trained models (XLM-RoBERTa, IndicBERT, mT5, Sentence-BERT) are used for embedding extraction on benchmark data, but their training data cutoff dates are not stated." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "No discussion of whether the shared task test data could overlap with the pre-trained models' training data, or whether the training and test sets in the shared task have distributional overlap (which Section 5 hints at but does not formally analyze)." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "No discussion of whether the benchmark data (AI-generated reviews) could have been seen by the pre-trained models during their training. The shared task data is presumably new, but this is not explicitly addressed." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study. It is a benchmark classification task." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 
280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No inference cost, latency, or API costs are reported. The paper does not mention wall-clock time for embedding generation or DNN training/inference." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No GPU hours, hardware specifications, or total training time are stated." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No mention of random seeds or seed sensitivity. DNN training results in Table 3 appear to be from a single run with no seed variation analysis." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of DNN training runs is not stated. Cross-validation is 5-fold (stated), but it is unclear how many times the DNN models were trained for the results in Table 3." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "No hyperparameter search is described. The DNN architecture is shown (Figure 3) but no search over architectures or hyperparameters is reported." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": false, 318 "justification": "No explanation of how the final model configuration was selected. The paper does not describe whether the 9% validation split was used for model selection or early stopping." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 
324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The paper only compares its own model variants against each other. No acknowledgment that the lack of external baselines or independent evaluation could bias the assessment of effectiveness." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": false, 332 "answer": false, 333 "justification": "No external baselines with differing compute budgets are compared, so compute-performance tradeoff analysis is not applicable." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "No discussion of whether the shared task benchmark validly measures AI-generated review detection capability. No analysis of what linguistic features the benchmark actually tests or whether performance on this benchmark transfers to real-world review detection." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding is involved. The system is a standard embedding-to-classifier pipeline." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of whether the pre-trained models' training data could include information from after the shared task data creation period." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether the features (embeddings) contain information that would not be available in a real deployment scenario." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether training and test examples share structural similarities (e.g., same AI generation models, same product domains, same review templates)." 
361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No leakage detection or prevention method is used." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "The ensemble model achieves 90% F1-score for Malayalam and 73% for Tamil on the shared task test set.", 372 "evidence": "Section 5 reports these results with confusion matrices in Figures 5 and 6. The given test sets are 200 samples (Malayalam) and 100 samples (Tamil).", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "The weighted ensemble model outperforms individual DNN models on the training split (98.2% Tamil, 94.0% Malayalam vs. best individual model 97.1% Tamil, 96.4% Malayalam).", 377 "evidence": "Table 3 shows F1-scores for all four individual DNN models and the ensemble on the 21% training split.", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "Transfer learning using pre-trained multilingual embeddings is effective for AI-generated review detection in low-resource Dravidian languages.", 382 "evidence": "Cross-validation results in Tables 1–2 show high F1-scores (90–97%) using XGBoost on embeddings from all four models. However, no non-transfer-learning baselines are compared.", 383 "supported": "weak" 384 }, 385 { 386 "claim": "The Tamil performance drop on the test set is attributable to distribution differences between training and test sets, potentially generated by different LLM models.", 387 "evidence": "Section 5 notes this hypothesis but provides no empirical analysis (e.g., distribution comparison, embedding visualization) to support it.", 388 "supported": "unsupported" 389 } 390 ], 391 "red_flags": [ 392 { 393 "flag": "No external baselines", 394 "detail": "The paper only compares its own model variants (four individual DNNs and an ensemble). No naive baselines (majority class, random), no prior work comparisons, and no other shared task participants' results are included. 
Without external baselines, claims of 'effectiveness' are unsubstantiated." 395 }, 396 { 397 "flag": "Very small test sets", 398 "detail": "Tamil test set has only 100 samples and Malayalam has 200 samples. With such small test sets, F1-scores have high variance — a few misclassifications can swing results by several percentage points. No uncertainty quantification is provided for the final test results." 399 }, 400 { 401 "flag": "Missing hyperparameters", 402 "detail": "No DNN hyperparameters (learning rate, batch size, epochs, optimizer, layer sizes), XGBoost hyperparameters, or ensemble weights are disclosed, making reproduction impossible even with the code." 403 }, 404 { 405 "flag": "Accuracy/F1 conflation", 406 "detail": "The abstract reports '90% f1-score' but the conclusion reports '90% accuracy' for the same Malayalam result. These are different metrics, and the paper switches between them without explanation." 407 }, 408 { 409 "flag": "Large CV-to-test performance gap unexplained", 410 "detail": "Tamil cross-validation F1 was 94–97% but test set F1 dropped to 73% — a gap of 21–24 points. The paper attributes this to 'differences in distributions' without empirical analysis. This suggests possible overfitting or fundamental distributional mismatch that undermines the main results." 411 } 412 ], 413 "cited_papers": [ 414 { 415 "title": "Automatic detection of generated text is easiest when humans are fooled", 416 "authors": ["Daphne Ippolito", "Daniel Duckworth", "Chris Callison-Burch", "Douglas Eck"], 417 "year": 2019, 418 "arxiv_id": "1911.00650", 419 "relevance": "Foundational work on detecting AI-generated text using BERT-based classifiers with various decoding strategies." 
420 }, 421 { 422 "title": "TweepFake: About detecting deepfake tweets", 423 "authors": ["Tiziano Fagni", "Fabrizio Falchi", "Margherita Gambini", "Antonio Martella", "Maurizio Tesconi"], 424 "year": 2021, 425 "relevance": "Introduces sequence-based classifiers (LSTM, GRU, CNN) for detecting AI-generated social media texts." 426 }, 427 { 428 "title": "Release strategies and the social impacts of language models", 429 "authors": ["Irene Solaiman", "Miles Brundage", "Jack Clark"], 430 "year": 2019, 431 "arxiv_id": "1908.09203", 432 "relevance": "Uses RoBERTa-based classifiers to detect GPT-2-generated text, achieving ~95% accuracy." 433 }, 434 { 435 "title": "Stylometric detection of AI-generated text in Twitter timelines", 436 "authors": ["Tharindu Kumarage", "Joshua Garland", "Amrita Bhattacharjee"], 437 "year": 2023, 438 "arxiv_id": "2303.03697", 439 "relevance": "Combines stylometric features with pre-trained language models for AI-generated text detection." 440 }, 441 { 442 "title": "SeqXGPT: Sentence-level AI-generated text detection", 443 "authors": ["Pengyu Wang", "Linyang Li", "Ke Ren", "Botian Jiang", "Dong Zhang", "Xipeng Qiu"], 444 "year": 2023, 445 "arxiv_id": "2310.08903", 446 "relevance": "Uses sentence-level log probability metrics from white-box LLMs for AI-generated text detection." 447 }, 448 { 449 "title": "GPT-who: An information density-based machine-generated text detector", 450 "authors": ["Saranya Venkatraman", "Adaku Uchendu", "Dongwon Lee"], 451 "year": 2023, 452 "arxiv_id": "2310.06202", 453 "relevance": "Proposes UID-based features for distinguishing AI-generated text from human text." 454 }, 455 { 456 "title": "Neural deepfake detection with factual structure of text", 457 "authors": ["Wanjun Zhong", "Duyu Tang", "Zenan Xu"], 458 "year": 2020, 459 "arxiv_id": "2010.07475", 460 "relevance": "Combines factual structure features with RoBERTa-based classifier for AI-generated content detection." 
461 }, 462 { 463 "title": "Classification of human- and AI-generated texts for different languages and domains", 464 "authors": ["Kristina Schaaff", "Tim Schlippe", "Lorenz Mindner"], 465 "year": 2024, 466 "relevance": "Multi-language AI-generated text classification using Sentence-BERT, directly relevant to this paper's multilingual, Sentence-BERT-based detection approach." 467 } 468 ], 469 "engagement_factors": { 470 "practical_relevance": { 471 "score": 1, 472 "justification": "The approach could be used for AI-generated review detection in Tamil/Malayalam, but the niche language scope and small shared-task context limit immediate broad applicability." 473 }, 474 "surprise_contrarian": { 475 "score": 0, 476 "justification": "Standard transfer learning approach with expected results — no surprising findings or challenges to conventional wisdom." 477 }, 478 "fear_safety": { 479 "score": 1, 480 "justification": "Addresses AI-generated fake reviews which is a known concern for online trust, but does not reveal novel attacks or systemic risks." 481 }, 482 "drama_conflict": { 483 "score": 0, 484 "justification": "No controversy, no challenging claims about other work or products." 485 }, 486 "demo_ability": { 487 "score": 1, 488 "justification": "Code is on GitHub, but requires specific shared-task datasets and no environment setup is provided." 489 }, 490 "brand_recognition": { 491 "score": 0, 492 "justification": "Workshop paper from University of Moratuwa researchers, no major lab or product involved." 493 } 494 } 495 }