scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24757B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Empowering Low-Resource Languages: TraSe Architecture for Enhanced Retrieval-Augmented Generation in Bangla",
      6     "authors": [
      7       "Atia Shahnaz Ipa",
      8       "Mohammad Abu Tareq Rony",
      9       "Mohammad Shariful Islam"
     10     ],
     11     "year": 2025,
     12     "venue": "LM4UC 2025 Workshop",
     13     "arxiv_id": null,
     14     "doi": "10.18653/v1/2025.lm4uc-1.2"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "Abstract claims 34% accuracy with automatic retrieval but Table 3 shows 33% (Bert-base-multilingual) or 34% only for 2-shot configuration. Claims are not fully consistent with results presented.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "Claims TraSe 'improves' accuracy but provides no ablation study isolating what components drive improvement. Only baseline comparisons shown without understanding which TraSe elements matter.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Abstract states 'has the potential to enhance question-answering systems for Bangla and similar languages' but testing is limited to Bangla on one dataset with one LLM. Generalization claim extends beyond evidence.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No discussion of why TraSe works better. Why does selecting between answers help? Is it redundancy, averaging, or answer quality differences? Single explanation assumed without exploration.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Measures binary accuracy on QA pairs but claims this reflects 'RAG performance' and 'answer selection accuracy' without discussing whether binary correctness captures the right outcome for RAG systems.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Dedicated 'Limitations' section present at end of paper identifying constraints.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Identifies 'single language model' and 'smaller sample size' but these are boilerplate with no specifics. What sample size is adequate? What would multi-model evaluation show? No concrete threat analysis.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Scope implicitly limited to Bangla Wikipedia QA on Llama 2 7B, but explicit scope boundaries (e.g., 'results do not apply to other languages, domains, or models') are not formally stated.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source mentioned anywhere in paper.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations with Khulna University and Noakhali Science & Technology University clearly stated.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding mentioned.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement provided.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms defined: RAG explained as 'combines information retrieval and generative models'; Translative prompting explained with method (translate to English → query → translate back); TraSe architecture described in methodology.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1.1 explicitly lists three main contributions: (1) 200-QA Bangla dataset, (2) Translative prompting method, (3) TraSe architecture. Reader knows what paper adds.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 provides 3 pages of Related Work extensively discussing RAG evolution, recent innovations (Corrective RAG, SelfMem, Iter-RetGen, etc.), and showing how this work fits in the landscape.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Paper states 'code is available at the following GitHub repository: https://github.com/Atia6/TraSe-Bangla-RAG.' Code publicly released.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "Paper describes creating a 200-QA dataset but nowhere states that the dataset is publicly available or released. No link to data provided.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Hyperparameters given (temperature=0.0001, top_k=10, bfloat16) and libraries mentioned (transformers, LangChain) but no requirements.txt, Dockerfile, or complete dependency specification provided.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions in paper. Code link provided but paper text has no walkthroughs for obtaining data, running pipeline, or reproducing results.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Results reported as single accuracy/F1 numbers with no confidence intervals, error bars, or variance measures across runs or folds.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "Improvements shown (e.g., 22%→33%, 51%→63%) but no statistical significance tests (t-tests, chi-square, etc.) performed to determine if differences are meaningful.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "Percentage point improvements visible (22→33 is 11pp gain) but effect sizes not formally reported or contextualized relative to baseline variance.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "200 total QA pairs used but no justification given for why 200 is adequate. No power analysis. Limitations section acknowledges 'smaller sample size' but provides no target.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Single accuracy numbers reported per condition with no standard deviation, error bars, or cross-validation folds. No evidence of multiple runs.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Four baseline prompting methods compared: Zero-shot, 2-shot, Self-Ask, and ReAct across multiple embedding/retrieval configurations.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "Baselines mixed: 0/2-shot from GPT-3 (Brown et al. 2020, 5 years old); ReAct and Self-Ask from 2023. Some baselines dated for 2025. No comparison to recent RAG-specific baselines or 2024 methods.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "No ablation isolating TraSe components. Translative method tested alone (Fig 4), but selector component not tested independently. No ablation of selector vs. ensemble baseline.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Both Accuracy and F1 Score reported in tables and text.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "States 'generated answers were manually evaluated and assigned as right or wrong answers.' Human judgment used for assessment.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "No mention of train/test/validation split. All 200 QA pairs appear evaluated on same conditions with no held-out set. No cross-validation mentioned.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results broken down by answer type (text-based vs number-based) in Figure 4 and Table 3, showing different performance patterns.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "One example given of exact-match issue (answer correct but not identical to reference) but no systematic failure analysis, error categorization, or discussion of when/why methods fail.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": false,
    228           "justification": "All methods score poorly (max 63% accuracy) and some baselines show '-' (failure), but these outcomes are not reported or analyzed as negative results. The paper presents only improvements and does not discuss what the low scores and failed configurations imply.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "Paper specifies 'Llama 2 7B' but no snapshot date, exact version identifier, or commit hash. Marketing name only, not reproducible version.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "Translative method shown conceptually in Figure 2 but actual prompt text not provided. No examples of system prompts, instruction templates, or exact wording sent to LLM.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Key hyperparameters reported: temperature=0.0001, top_k=10, bfloat16 dtype, max_tokens=3000. Reasonably comprehensive, though not exhaustive.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "TraSe architecture shown in Figure 3 with clear components: embedding, retrieval, selector LLM pipeline. Translative prompting method described. Scaffolding is transparent.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": false,
    260           "justification": "Only states 'dataset is preprocessed to convert to chunks of 5 sentences' with no details on tokenization, cleaning, normalization, or how 200 QA pairs extracted from 710 chunks.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "No indication that raw data (200 QA pairs, 27 Wikipedia articles, or retrieval corpus) is available for independent verification.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "Source stated (Bangla Wikipedia) but collection procedure missing. How were 200 questions generated? Who wrote them? What criteria selected them? All unstated.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human subjects recruited; using Wikipedia.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "High-level pipeline shown (27 articles → 710 chunks → 200 QA pairs) but selection mechanism at each step undocumented. How were 200 pairs chosen from 710 chunks?",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Llama 2 training cutoff date not mentioned. Cannot assess whether Wikipedia articles or QA patterns were in training data.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether Llama 2 may have seen Bangla Wikipedia or related QA examples during pretraining.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "No analysis of whether Wikipedia content was available before Llama 2 training cutoff or whether this affects evaluation validity.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human subjects study.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human subjects study.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human subjects study.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human subjects study.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human subjects study.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human subjects study.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human subjects study.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference cost (USD, tokens, latency) or computational requirements reported. Impractical to estimate resource needs.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No total computational budget mentioned for training or inference. GPU hours, API costs, or FLOPs not provided.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "TraSe achieves 34% accuracy with automatic retrieval and 63% with Human-in-the-Loop retrieval",
    373       "evidence": "Table 3 shows 33% accuracy (Bert-base-multilingual 0-shot+Translative) and 34% (BanglaBERT 2-shot+Translative) with automatic retrieval; 63% with HIL retrieval (0-shot+Translative)",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "TraSe outperforms baseline methods (zero-shot, 2-shot, Self-Ask, ReAct)",
    378       "evidence": "Table 3 shows TraSe improving accuracy from 22% (baseline 0-shot) to 33-34% and 51% (HIL baseline) to 63% across retrieval methods",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Translative prompting is particularly effective for text-based answers",
    383       "evidence": "Figure 4 shows translative method achieving 0.28-0.61 accuracy on text answers vs 0.07-0.27 for other methods; explicitly stated as 'seen to be useful for text-based answers'",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Llama 2 7B has poor baseline performance on Bangla without translative prompting",
    388       "evidence": "Zero-shot, 2-shot, ReAct methods all achieve <25% accuracy without translative component",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Human-in-the-Loop context retrieval dramatically improves performance (51% vs 18-33% automatic)",
    393       "evidence": "Table 3 consistently shows HIL achieving 43-63% vs automatic retrieval 14-34%",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "200-pair Bangla Wikipedia dataset is adequate for evaluating RAG methods",
    398       "evidence": "Results reported on this dataset size with no justification or comparison",
    399       "supported": "unsupported"
    400     },
    401     {
    402       "claim": "TraSe can enhance question-answering for Bangla and similar low-resource languages",
    403       "evidence": "Only Bangla tested; no testing on other languages; generalization beyond evidence",
    404       "supported": "weak"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval",
    409     "case-study"
    410   ],
    411   "key_findings": "The paper introduces TraSe, a selective prompting architecture for Bangla retrieval-augmented generation that combines translative prompting (query→English→answer→Bangla) with a selector component. On a 200-pair Wikipedia-based QA dataset, TraSe achieves 33-34% accuracy with automatic retrieval and 63% with human-in-the-loop context insertion, outperforming baseline zero-shot and few-shot prompting. Translative prompting is particularly effective for text-based questions but remains low-performing overall, suggesting fundamental challenges for Bangla RAG on small language models.",
    412   "red_flags": [
    413     {
    414       "flag": "Extremely small evaluation set",
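    415       "detail": "200 total QA pairs is a small evaluation set: at accuracies near 50%, a 95% confidence interval is roughly ±7 percentage points, so modest differences between methods may not be statistically distinguishable. No train/test split mentioned; appears all 200 pairs used for evaluation. Limits generalizability."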
    416     },
    417     {
    418       "flag": "Suspicious F1-accuracy mismatch",
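    419       "detail": "Table 3 reports a maximum F1 of 0.77 alongside a maximum accuracy of 0.63. F1 is not bounded above by accuracy in general (it can exceed accuracy when positive labels dominate), so the gap is not contradictory by itself; however, the scan does not record how precision and recall were defined for manually judged right/wrong answers, so the metric computation should be verified against the paper."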
    420     },
    421     {
    422       "flag": "Human-in-the-Loop results unrealistic",
    423       "detail": "Best result (63%) requires manual context insertion by human. Not a practical 'RAG' system if humans manually select contexts; removes the retrieval challenge."
    424     },
    425     {
    426       "flag": "No ablation of TraSe components",
    427       "detail": "What drives improvement? Translative method alone? Selector ensemble? Different answer sources? No ablation separates effects. Impossible to understand what matters."
    428     },
    429     {
    430       "flag": "Inconsistent abstract results",
    431       "detail": "Abstract claims 34% with automatic retrieval but Table 3 shows 33% (Bert-multilingual primary result) or 34% only for 2-shot. Numbers don't match exactly."
    432     },
    433     {
    434       "flag": "Single model tested",
    435       "detail": "Only Llama 2 7B evaluated. Claims about Bangla RAG cannot be generalized without testing other LLMs, including newer and now more widely used models (Llama 3, GPT-4, etc.)."
    436     },
    437     {
    438       "flag": "No statistical significance testing",
    439       "detail": "Differences like 22%→33% shown without p-values, CIs, or cross-validation. Cannot determine if improvements are noise or real."
    440     },
    441     {
    442       "flag": "Missing reproduction details",
    443       "detail": "Actual prompts not provided. How are 200 QA pairs selected from Wikipedia? How are contexts chosen for retrieval evaluation? Dataset not publicly available."
    444     },
    445     {
    446       "flag": "No error analysis",
    447       "detail": "One example failure given but no systematic analysis of error types. When/why does system fail? What's the error distribution?"
    448     },
    449     {
    450       "flag": "Baseline comparison weak",
    451       "detail": "No comparison to dedicated low-resource RAG systems or multilingual RAG baselines. Only basic prompting methods compared."
    452     },
    453     {
    454       "flag": "Data leakage risk",
    455       "detail": "Llama 2 training cutoff not stated. Bangla Wikipedia likely in pretraining. Cannot assess whether test set is contaminated."
    456     },
    457     {
    458       "flag": "Unclear data pipeline",
    459       "detail": "How were 200 QA pairs extracted from 710 chunks? By humans? Automatic? Selection criteria unstated. Reproducibility compromised."
    460     }
    461   ],
    462   "cited_papers": [
    463     {
    464       "title": "Retrieval-augmented generation for large language models: A survey",
    465       "relevance": "Foundational survey on RAG paradigm and evolution of techniques that this paper builds on"
    466     },
    467     {
    468       "title": "ReAct: Synergizing reasoning and acting in language models",
    469       "relevance": "Baseline prompting method (ReAct) compared against TraSe in evaluation"
    470     },
    471     {
    472       "title": "BanglaBERT: Language model pretraining and benchmarks for low-resource language understanding evaluation in Bangla",
    473       "relevance": "Provides embedding model (BanglaBERT) used for document retrieval in TraSe architecture"
    474     },
    475     {
    476       "title": "Language Models are Few-Shot Learners",
    477       "relevance": "Introduces few-shot prompting baseline (2-shot) evaluated against TraSe"
    478     },
    479     {
    480       "title": "Active retrieval augmented generation",
    481       "relevance": "FLARE method for iterative retrieval mentioned as RAG advancement"
    482     },
    483     {
    484       "title": "Corrective Retrieval Augmented Generation",
    485       "relevance": "Recent RAG innovation showing retrieval evaluation and dynamic correction strategies"
    486     },
    487     {
    488       "title": "Retrieval-augmented text generation for large language models",
    489       "relevance": "Survey of RAG integration methods and evaluation frameworks"
    490     },
    491     {
    492       "title": "Graph Retrieval-Augmented Generation",
    493       "relevance": "Structured retrieval approach for RAG representing recent advances beyond flat document retrieval"
    494     }
    495   ],
    496   "engagement_factors": {
    497     "practical_relevance": {
    498       "score": 1,
    499       "justification": "Best results (63%) require human context insertion—not practical. Only tested on Bangla with no deployment pathway shown. Limited real-world applicability."
    500     },
    501     "surprise_contrarian": {
    502       "score": 0,
    503       "justification": "Applying known prompting techniques (translation, selection) to new language is incremental. No surprising findings about language models or RAG paradigm."
    504     },
    505     "fear_safety": {
    506       "score": 0,
    507       "justification": "No safety, alignment, or risk discussion. Paper is purely technical on QA accuracy with no safety implications."
    508     },
    509     "demo_ability": {
    510       "score": 2,
    511       "justification": "GitHub code available but 200-pair dataset not released. Can build system but not reproduce exact results. Moderate demo-ability."
    512     },
    513     "drama_conflict": {
    514       "score": 0,
    515       "justification": "No controversy, conflict, or dramatic findings. Technical paper on niche low-resource language RAG without compelling narrative."
    516     },
    517     "brand_recognition": {
    518       "score": 0,
    519       "justification": "Unknown authors from small universities. Published in workshop (LM4UC), not major venue. No institutional prestige or brand recognition."
    520     }
    521   },
    522   "hn_data": {
    523     "threads": [],
    524     "top_points": 0,
    525     "total_points": 0,
    526     "total_comments": 0
    527   }
    528 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs