scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25886B)
      1 {
      2   "paper": {
      3     "title": "Empowering Low-Resource Languages: TraSe Architecture for Enhanced Retrieval-Augmented Generation in Bangla",
      4     "authors": [
      5       "Atia Shahnaz Ipa",
      6       "Mohammad Abu Tareq Rony",
      7       "Mohammad Shariful Islam"
      8     ],
      9     "year": 2025,
     10     "venue": "LM4UC 2025 (Workshop on Language Models for Underserved Communities, ACL 2025)",
     11     "doi": "10.18653/v1/2025.lm4uc-1.2"
     12   },
     13   "scan_version": 3,
     14   "active_modules": ["experimental_rigor", "data_leakage"],
     15   "methodology_tags": ["benchmark-eval"],
     16   "key_findings": "The TraSe architecture combines translative prompting (translate Bangla query to English, generate answer, translate back) with a selection mechanism that picks the best answer from two prompting strategies. On a 200-question Bangla QA dataset from Wikipedia, TraSe achieves 34% accuracy with automatic retrieval (BanglaBERT embeddings) and 63% with human-in-the-loop retrieval, outperforming zero-shot (22%/51%), 2-shot, Self-Ask, and ReAct baselines. Translative prompting is particularly effective for text-based (vs. number-based) answers.",
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "A GitHub repository is provided in the abstract: https://github.com/Atia6/TraSe-Bangla-RAG."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper states that the authors created 200 QA pairs from a Bangla Wikipedia dump, but the only availability statement is 'The code is available'; release of the dataset itself is never explicitly confirmed."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper mentions Llama 2 7B, transformers library, LangChain, and bfloat16, but provides no requirements.txt, Dockerfile, or detailed environment setup with library versions."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README content or reproduction steps are described."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results in Table 3 and Figure 4 are reported as point estimates with no confidence intervals or error bars."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper claims TraSe 'outperforms' baselines and shows 'significant improvements' but provides no statistical significance tests. Comparisons are based solely on numerical differences."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Table 3 reports absolute accuracy and F1 scores for all methods with and without retrieval, providing baseline context (e.g., 22% for 0-shot direct vs. 33% for 0shot+Translative with BanglaBERT)."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The dataset contains only 200 QA pairs from 27 Wikipedia articles. No justification is given for why 200 questions or 27 articles were chosen."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No variance, standard deviation, or multi-run results are reported. All results appear to be from single runs."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper compares against four baseline methods: zero-shot, 2-shot, Self-Ask, and ReAct (Section 3.2, Table 3)."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Baselines include ReAct (Yao et al., 2023) and Self-Ask (Press et al., 2023), which are contemporary prompting methods for the task."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Table 3 shows results for different TraSe combinations (0shot+Translative, 2shot+Translative, SelfAsk+Translative, ReAct+Translative) compared to standalone methods, effectively demonstrating which component combinations matter."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Both accuracy and F1 score are reported for all methods in Table 3."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Section 3.6 states 'The generated answers were manually evaluated and assigned as right or wrong answers. Based on manual evaluation the accuracy has been determined.'"
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No mention of train/dev/test splits. The temperature was tuned on what appears to be the same 200 questions used for final evaluation (Figure 1), with no held-out test set."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Figure 4 breaks down results by text-based vs. number-based answers, and Table 3 breaks down by retrieval method (BanglaBERT, Bert-multilingual, HIL)."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "The paper provides one example in Section 3.6 illustrating why exact match fails as a metric, but does not analyze where or why the system produces wrong answers for the 37-66% of cases it fails on."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "All TraSe variants are presented as improvements. The paper does not discuss any approaches that were tried and abandoned, or configurations that degraded performance."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims of 34% accuracy with automatic retrieval and 63% with HIL retrieval are confirmed by Table 3 (2shot+Translative: 0.34 with BanglaBERT; 0shot+Translative: 0.33 with BanglaBERT and 0.63 with HIL). The claim of 'outperforming baseline methods' is supported by the comparison table."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper claims TraSe 'improves' and 'enhances' performance. The experimental design holds the model constant (Llama 2 7B) and varies only the prompting/selection method, which is adequate single-variable manipulation for these causal claims."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title claims 'Enhanced Retrieval-Augmented Generation in Bangla' broadly, but the study uses only one model (Llama 2 7B), one dataset (200 questions from 27 Wikipedia articles), and one language. The abstract claims 'potential to enhance question-answering systems for Bangla and similar languages' without testing any other language."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "No alternative explanations are considered. For example, the translative method's advantage could be because Llama 2 has far more English training data, but this confound is not discussed. The selection mechanism's advantage from having two answer candidates is also not analyzed."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper measures accuracy on QA pairs and frames results as QA accuracy. The measurements match the granularity of the claims without overreaching into broader constructs."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper says 'Llama 2 7B model' but does not specify the exact variant (base vs. chat vs. instruct) or the HuggingFace model ID. BanglaBERT and bert-base-multilingual-cased are named but without version numbers."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The prompting strategies (zero-shot, 2-shot, translative, etc.) are described conceptually and in flowcharts (Figures 2-3), but the actual prompt text used is never provided."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Section 3.3 reports temperature (0.0001), top_k (10), max sequence length (3000 tokens), and data type (bfloat16)."
    156       },
    157       "scaffolding_described": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The TraSe architecture is described in Section 3.5 and Figure 3, showing the multi-step pipeline: embedding, retrieval, parallel prompting (translative + baseline), and LLM-based answer selection."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 3.1 and Table 1 document the preprocessing: Bangla Wikipedia dump → 27 articles → chunking into 5-sentence segments (710 chunks) → 200 QA pairs with 3 related contexts each."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing the use of a single language model and smaller sample size."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The Limitations section identifies study-specific threats: single language model may not generalize across models, and the smaller sample size (200 questions) 'may affect the generalizability of the results.'"
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The Limitations section mentions needing more models and languages, but does not explicitly state what the results do NOT show (e.g., 'our results apply only to Bangla Wikipedia QA with Llama 2 7B')."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The 200 QA pairs and their manual evaluation labels are not explicitly made available. Only code is mentioned as released."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 3.1 describes the data source (Bangla Wikipedia dump), structure (27 articles, 53,575 words, 710 chunks, 200 QA pairs), chunking strategy (5 sentences), and answer types (70 text-based, 130 number-based)."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants were recruited. Data comes from Bangla Wikipedia, a public data source."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The pipeline from Wikipedia dump to final QA pairs has gaps: how were the 27 specific articles selected from the dump? How were the 200 questions created from those articles? Who authored the questions? These steps are unexplained."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding source, acknowledgments section, or grant information is provided anywhere in the paper."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Author affiliations are clearly listed: Khulna University of Engineering & Technology and Noakhali Science & Technology University, Bangladesh. No commercial product is being evaluated."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of any funding statement means this criterion is not satisfied."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is included in the paper."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper uses Llama 2 7B to answer questions derived from Bangla Wikipedia but never states the model's training data cutoff date."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The QA pairs are derived from Bangla Wikipedia, which is almost certainly included in Llama 2's training data. This overlap is never discussed."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "Since the benchmark is derived from Wikipedia and Llama 2 was trained on internet data including Wikipedia, the model may already know the answers. This contamination risk is not addressed."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants. The 'Human-in-the-Loop' component refers to providing oracle retrieval contexts, not a human subjects study."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants involved in this study."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants involved in this study."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants involved in this study."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants involved in this study."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants involved in this study."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants involved in this study."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No inference cost, latency, or per-query compute time is reported despite TraSe requiring multiple LLM calls per query."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No hardware details, GPU hours, or total compute budget are mentioned."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No multi-seed results reported. All results appear to be from a single run with a fixed temperature of 0.0001."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The number of experimental runs is not stated. Results appear to be from single runs."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Figure 1 shows 6 temperature values tested, but no systematic search method or total budget is described. Other hyperparameters (top_k=10) appear chosen without stated justification."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Temperature 0.0001 was selected based on accuracy in Figure 1, but this appears to be evaluated on the same data used for final results — no separate validation set is mentioned."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": false,
    318         "answer": false,
    319         "justification": "No statistical tests are performed at all, so multiple comparison correction is inapplicable."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors evaluate their own TraSe system against baselines they implemented. No acknowledgment of potential author-evaluation bias."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "TraSe requires at least 3 LLM calls per query (translative + baseline + selector) vs. 1 for baselines, but this computational cost difference is never discussed or compared against the accuracy gain."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper uses a custom 200-question benchmark without discussing whether these 200 questions adequately measure RAG capability for Bangla, or whether the text/number split is representative."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "The scaffold (TraSe architecture) IS the method being evaluated. All comparisons use the same underlying model (Llama 2 7B) with different prompting/scaffolding strategies as the independent variable."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "Llama 2 was trained on data including Wikipedia (collected before July 2023). The Bangla Wikipedia dump used for the benchmark predates this. Temporal leakage is not discussed."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the retrieval contexts or question phrasing leak answer information beyond what would be available in a real deployment."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the 200 questions from 27 articles introduce structural dependencies (e.g., multiple questions from the same article sharing context)."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No leakage detection or prevention method is applied despite the high risk of Wikipedia content being in Llama 2's training data."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "TraSe achieves 34% accuracy with automatic retrieval (BanglaBERT) and 63% with Human-in-the-Loop retrieval for Bangla QA.",
    368       "evidence": "Table 3 shows TraSe with BanglaBERT retrieval achieving F1=0.50/accuracy=0.33 in the 0shot+Translative variant and F1=0.51/accuracy=0.34 in the 2shot+Translative variant, and F1=0.77/accuracy=0.63 with HIL retrieval.",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "TraSe outperforms all baseline prompting methods across retrieval settings.",
    373       "evidence": "Table 3 shows TraSe variants consistently achieving higher accuracy and F1 scores than standalone methods (zero-shot: 22%→33% with BanglaBERT, 51%→63% with HIL). However, no statistical tests confirm these differences are significant on a 200-question dataset.",
    374       "supported": "weak"
    375     },
    376     {
    377       "claim": "Translative prompting is particularly effective for text-based answers compared to other prompting methods.",
    378       "evidence": "Figure 4 shows the Translative method achieving the highest accuracy for text-based answers across all retrieval settings (0.28 for BanglaBERT text vs. 0.18 for zero-shot, 0.61 for HIL text vs. 0.36 for zero-shot).",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "The TraSe architecture marks a significant advancement in RAG for low-resource languages.",
    383       "evidence": "Only demonstrated on Bangla with a single model (Llama 2 7B) and 200 questions. Peak accuracy is 63% with oracle retrieval and 34% with automatic retrieval. No other low-resource languages tested.",
    384       "supported": "weak"
    385     }
    386   ],
    387   "red_flags": [
    388     {
    389       "flag": "Tiny dataset",
    390       "detail": "Only 200 QA pairs from 27 Wikipedia articles. This is extremely small for making generalizable claims about RAG performance for an entire language. With 70 text-based and 130 number-based questions, per-category samples are even smaller."
    391     },
    392     {
    393       "flag": "No statistical significance tests",
    394       "detail": "The paper claims 'significant improvements' and that TraSe 'outperforms' baselines, but provides no statistical tests. On a 200-question dataset, many of the observed differences could be within random variation."
    395     },
    396     {
    397       "flag": "Test set contamination via hyperparameter tuning",
    398       "detail": "Temperature was optimized by evaluating on what appears to be the same 200 questions used for final results (Figure 1). No held-out validation or test set is described, meaning the reported accuracy may be inflated."
    399     },
    400     {
    401       "flag": "Wikipedia contamination risk",
    402       "detail": "All 200 QA pairs are derived from Bangla Wikipedia, which is almost certainly in Llama 2's training data. The model may already know the answers independently of retrieval, confounding the evaluation of the RAG system."
    403     },
    404     {
    405       "flag": "Overclaiming from limited evidence",
    406       "detail": "The abstract claims TraSe 'marks a significant advancement in RAG for low-resource languages' based on one model, one language, 200 questions, and a peak QA accuracy of 34% with automatic retrieval. The 63% figure relies on human-provided oracle contexts."
    407     },
    408     {
    409       "flag": "No error analysis for majority failure cases",
    410       "detail": "The system fails on 37-66% of questions depending on configuration, but no analysis is provided of why it fails. The paper only discusses successes."
    411     }
    412   ],
    413   "cited_papers": [
    414     {
    415       "title": "Retrieval-Augmented Generation for Large Language Models: A Survey",
    416       "authors": ["Yunfan Gao", "Yun Xiong", "Xinyu Gao"],
    417       "year": 2023,
    418       "relevance": "Comprehensive survey of RAG techniques for LLMs, directly relevant to understanding the RAG landscape this paper builds upon."
    419     },
    420     {
    421       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    422       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    423       "year": 2023,
    424       "relevance": "Introduces the ReAct prompting framework used as a baseline in this study, relevant to agentic LLM workflows."
    425     },
    426     {
    427       "title": "Language Models are Few-Shot Learners",
    428       "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"],
    429       "year": 2020,
    430       "relevance": "Foundational GPT-3 paper establishing few-shot prompting, which is a baseline method in this study."
    431     },
    432     {
    433       "title": "BanglaBERT: Language Model Pretraining and Benchmarks for Low-Resource Language Understanding Evaluation in Bangla",
    434       "authors": ["Abhik Bhattacharjee", "Tahmid Hasan", "Wasi Ahmad"],
    435       "year": 2022,
    436       "relevance": "Provides the BanglaBERT embedding model used in this study's retrieval pipeline, key resource for Bangla NLP."
    437     },
    438     {
    439       "title": "Corrective Retrieval Augmented Generation",
    440       "authors": ["Shi-Qi Yan", "Jia-Chen Gu", "Yun Zhu"],
    441       "year": 2024,
    442       "relevance": "Introduces retrieval quality evaluation and dynamic retrieval actions in RAG, relevant to improving RAG reliability."
    443     },
    444     {
    445       "title": "Ragas: Automated Evaluation of Retrieval Augmented Generation",
    446       "authors": ["ES Shahul", "Jithin James", "Luis Espinosa Anke"],
    447       "year": 2023,
    448       "relevance": "Proposes reference-free evaluation framework for RAG pipelines, relevant to RAG evaluation methodology."
    449     },
    450     {
    451       "title": "A Survey on Retrieval-Augmented Text Generation for Large Language Models",
    452       "authors": ["Yizheng Huang", "Xiangji Huang"],
    453       "year": 2024,
    454       "relevance": "Survey covering RAG challenges in evaluation, retrieval quality, and real-world implementation."
    455     },
    456     {
    457       "title": "Active Retrieval Augmented Generation (FLARE)",
    458       "authors": ["Zhengbao Jiang", "Frank F. Xu", "Luyu Gao"],
    459       "year": 2023,
    460       "relevance": "Introduces active retrieval mechanism for continuous information gathering during generation, advancing RAG methodology."
    461     }
    462   ],
    463   "engagement_factors": {
    464     "practical_relevance": {
    465       "score": 1,
    466       "justification": "Potentially useful for Bangla NLP practitioners but limited by 34% accuracy and single-model evaluation."
    467     },
    468     "surprise_contrarian": {
    469       "score": 0,
    470       "justification": "Confirms the expected finding that low-resource languages need specialized approaches; no surprising results."
    471     },
    472     "fear_safety": {
    473       "score": 0,
    474       "justification": "No AI safety or security concerns raised."
    475     },
    476     "drama_conflict": {
    477       "score": 0,
    478       "justification": "No controversy or conflict with existing work."
    479     },
    480     "demo_ability": {
    481       "score": 2,
    482       "justification": "Code is available on GitHub, though reproducibility is limited by missing environment specs and prompts."
    483     },
    484     "brand_recognition": {
    485       "score": 0,
    486       "justification": "Authors from Bangladeshi universities, no well-known lab or product association."
    487     }
    488   }
    489 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs