scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (29515B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Empowering Low-Resource Languages: TraSe Architecture for Enhanced Retrieval-Augmented Generation in Bangla",
      6     "authors": [
      7       "Atia Shahnaz Ipa",
      8       "Mohammad Abu Tareq Rony",
      9       "Mohammad Shariful Islam"
     10     ],
     11     "year": 2025,
     12     "venue": "LM4UC 2025 Workshop",
     13     "arxiv_id": null,
     14     "doi": "10.18653/v1/2025.lm4uc-1.2"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims of 34% accuracy with automatic retrieval and 63% with HIL retrieval are confirmed by Table 3 (0shot+Translative: 0.34 with BanglaBERT, 0.63 with HIL). The claim of 'outperforming baseline methods' is supported by the comparison table.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper claims TraSe 'improves' and 'enhances' performance. The experimental design holds the model constant (Llama 2 7B) and varies only the prompting/selection method, which is adequate single-variable manipulation for these causal claims.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The title claims 'Enhanced Retrieval-Augmented Generation in Bangla' broadly, but the study uses only one model (Llama 2 7B), one dataset (200 questions from 27 Wikipedia articles), and one language. The abstract claims 'potential to enhance question-answering systems for Bangla and similar languages' without testing any other language.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No alternative explanations are considered. For example, the translative method's advantage could be because Llama 2 has far more English training data, but this confound is not discussed. The selection mechanism's advantage from having two answer candidates is also not analyzed.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper measures accuracy on QA pairs and frames results as QA accuracy. The measurements match the granularity of the claims without overreaching into broader constructs.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing the use of a single language model and smaller sample size.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The Limitations section identifies study-specific threats: single language model may not generalize across models, and the smaller sample size (200 questions) 'may affect the generalizability of the results.'",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The Limitations section mentions needing more models and languages, but does not explicitly state what the results do NOT show (e.g., 'our results apply only to Bangla Wikipedia QA with Llama 2 7B').",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source, acknowledgments section, or grant information is provided anywhere in the paper.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly listed: Khulna University of Engineering & Technology and Noakhali Science & Technology University, Bangladesh. No commercial product is being evaluated.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of any funding statement means this criterion is not satisfied.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is included in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "RAG defined in intro ('combines information retrieval and generative models'). Translative prompting described in 3.4. Low-resource language assumed familiar but not formally defined.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1.1 explicitly lists three contributions: (1) Bangla QA dataset, (2) Translative prompting method, (3) TraSe architecture. Clear and enumerated.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 reviews RAG evolution (FLARE, Corrective RAG, SelfMem, etc.), identifies gap in low-resource language support, and positions this work as addressing that gap.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "A GitHub repository is provided in the abstract: https://github.com/Atia6/TraSe-Bangla-RAG.",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The paper states they created 200 QA pairs from Bangla Wikipedia dump but only explicitly mentions code availability. The dataset release is not explicitly confirmed; only 'The code is available' is stated.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper mentions Llama 2 7B, transformers library, LangChain, and bfloat16, but provides no requirements.txt, Dockerfile, or detailed environment setup with library versions.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README content or reproduction steps are described.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results in Table 3 and Figure 4 are reported as point estimates with no confidence intervals or error bars.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "The paper claims TraSe 'outperforms' baselines and shows 'significant improvements' but provides no statistical significance tests. Comparisons are based solely on numerical differences.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Table 3 reports absolute accuracy and F1 scores for all methods with and without retrieval, providing baseline context (e.g., 22% for 0-shot direct vs. 33% for 0shot+Translative with BanglaBERT).",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The dataset contains only 200 QA pairs from 27 Wikipedia articles. No justification is given for why 200 questions or 27 articles were chosen.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance, standard deviation, or multi-run results are reported. All results appear to be from single runs.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "The paper compares against four baseline methods: zero-shot, 2-shot, Self-Ask, and ReAct (Section 3.2, Table 3).",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Baselines include ReAct (Yao et al., 2023) and Self-Ask (Press et al., 2023), which are contemporary prompting methods for the task.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Table 3 shows results for different TraSe combinations (0shot+Translative, 2shot+Translative, SelfAsk+Translative, ReAct+Translative) compared to standalone methods, effectively demonstrating which component combinations matter.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Both accuracy and F1 score are reported for all methods in Table 3.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Section 3.6 states 'The generated answers were manually evaluated and assigned as right or wrong answers. Based on manual evaluation the accuracy has been determined.'",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "No mention of train/dev/test splits. The temperature was tuned on what appears to be the same 200 questions used for final evaluation (Figure 1), with no held-out test set.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Figure 4 breaks down results by text-based vs. number-based answers, and Table 3 breaks down by retrieval method (BanglaBERT, Bert-multilingual, HIL).",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "The paper provides one example in Section 3.6 illustrating why exact match fails as a metric, but does not analyze where or why the system produces wrong answers for the 37-66% of cases it fails on.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": false,
    228           "justification": "All TraSe variants are presented as improvements. The paper does not discuss any approaches that were tried and abandoned, or configurations that degraded performance.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "The paper says 'Llama 2 7B model' but does not specify the exact variant (base vs. chat vs. instruct) or the HuggingFace model ID. BanglaBERT and bert-base-multilingual-cased are named but without version numbers.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "The prompting strategies (zero-shot, 2-shot, translative, etc.) are described conceptually and in flowcharts (Figures 2-3), but the actual prompt text used is never provided.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Section 3.3 reports temperature (0.0001), top_k (10), max sequence length (3000 tokens), and data type (bfloat16).",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The TraSe architecture is described in Section 3.5 and Figure 3, showing the multi-step pipeline: embedding, retrieval, parallel prompting (translative + baseline), and LLM-based answer selection.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 3.1 and Table 1 document the preprocessing: Bangla Wikipedia dump → 27 articles → chunking into 5-sentence segments (710 chunks) → 200 QA pairs with 3 related contexts each.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "The 200 QA pairs and their manual evaluation labels are not explicitly made available. Only code is mentioned as released.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 3.1 describes the data source (Bangla Wikipedia dump), structure (27 articles, 53,575 words, 710 chunks, 200 QA pairs), chunking strategy (5 sentences), and answer types (70 text-based, 130 number-based).",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants were recruited. Data comes from Bangla Wikipedia, a public data source.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "The pipeline from Wikipedia dump to final QA pairs has gaps: how were the 27 specific articles selected from the dump? How were the 200 questions created from those articles? Who authored the questions? These steps are unexplained.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "The paper uses Llama 2 7B to answer questions derived from Bangla Wikipedia but never states the model's training data cutoff date.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "The QA pairs are derived from Bangla Wikipedia, which is almost certainly included in Llama 2's training data. This overlap is never discussed.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "Since the benchmark is derived from Wikipedia and Llama 2 was trained on internet data including Wikipedia, the model may already know the answers. This contamination risk is not addressed.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants. The 'Human-in-the-Loop' component refers to providing oracle retrieval contexts, not a human subjects study.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants involved in this study.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants involved in this study.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants involved in this study.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants involved in this study.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants involved in this study.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants involved in this study.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference cost, latency, or per-query compute time is reported despite TraSe requiring multiple LLM calls per query.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No hardware details, GPU hours, or total compute budget are mentioned.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No multi-seed results reported. All results appear to be from a single run with a fixed temperature of 0.0001.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "The number of experimental runs is not stated. Results appear to be from single runs.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "Figure 1 shows 6 temperature values tested, but no systematic search method or total budget is described. Other hyperparameters (top_k=10) appear chosen without stated justification.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "Temperature 0.0001 was selected based on accuracy in Figure 1, but this appears to be evaluated on the same data used for final results — no separate validation set is mentioned.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": false,
    395           "answer": false,
    396           "justification": "No statistical tests are performed at all, so multiple comparison correction is inapplicable.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors evaluate their own TraSe system against baselines they implemented. No acknowledgment of potential author-evaluation bias.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "TraSe requires at least 3 LLM calls per query (translative + baseline + selector) vs. 1 for baselines, but this computational cost difference is never discussed or compared against the accuracy gain.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "The paper uses a custom 200-question benchmark without discussing whether these 200 questions adequately measure RAG capability for Bangla, or whether the text/number split is representative.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "The scaffold (TraSe architecture) IS the method being evaluated. All comparisons use the same underlying model (Llama 2 7B) with different prompting/scaffolding strategies as the independent variable.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "Llama 2 was trained on data including Wikipedia (collected before July 2023). The Bangla Wikipedia dump used for the benchmark predates this. Temporal leakage is not discussed.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of whether the retrieval contexts or question phrasing leak answer information beyond what would be available in a real deployment.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of whether the 200 questions from 27 articles introduce structural dependencies (e.g., multiple questions from the same article sharing context).",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No leakage detection or prevention method is applied despite the high risk of Wikipedia content being in Llama 2's training data.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "TraSe improves answer selection accuracy, achieving 34% with automatic retrieval and 63% with Human-in-the-Loop",
    455       "evidence": "Table 3 reports exact results: automatic retrieval 33% (0-shot), 34% (2-shot); HIL retrieval 63% (0-shot+Translative). Verified in paper.",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "Translative prompting outperforms baseline methods (zero-shot, 2-shot, Self-Ask, ReAct) for text-based answers",
    460       "evidence": "Figure 4 shows Translative achieving 0.28 accuracy on text-based answers, outperforming the other methods.",
    461       "supported": "moderate"
    462     },
    463     {
    464       "claim": "TraSe achieves substantial improvement over zero-shot baseline in HIL context (63% vs 51%)",
    465       "evidence": "Table 3 row 'HIL Retrieval' shows zero-shot direct 51%, zero-shot+Translative 63%. Results match claim.",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "The TraSe architecture marks a significant advancement in RAG for low-resource languages",
    470       "evidence": "Improvements shown relative to baselines, but baselines are weak (zero-shot, 2-shot). No comparison to existing low-resource RAG systems. Significance is relative.",
    471       "supported": "weak"
    472     },
    473     {
    474       "claim": "Llama 2 7B has not been trained on large amounts of Bangla data",
    475       "evidence": "Stated but not backed by empirical evidence or training data analysis. Reasonable assumption but unsupported.",
    476       "supported": "weak"
    477     },
    478     {
    479       "claim": "The method works effectively on both text-based and number-based Bangla question answering",
    480       "evidence": "Figure 4 shows both answer types evaluated. Text-based Translative: 0.28, number-based: 0.13. Text clearly stronger.",
    481       "supported": "moderate"
    482     },
    483     {
    484       "claim": "Retrieval quality is critical for TraSe performance (63% with human-curated contexts vs 34% with automatic)",
    485       "evidence": "Table 3 shows a 29pp gap between automatic (34%) and HIL (63%) retrieval. Demonstrates sensitivity to context quality.",
    486       "supported": "strong"
    487     }
    488   ],
    489   "methodology_tags": [
    490     "benchmark-eval",
    491     "case-study"
    492   ],
    493   "key_findings": "The paper introduces TraSe, combining translative prompting (translate Bangla→English, generate in English, translate→Bangla) with answer selection, evaluated on a manual 200-question Bangla Wikipedia QA dataset. TraSe achieves 34% accuracy with automatic retrieval and 63% with human-curated contexts, outperforming baselines. The work addresses a gap in RAG research for Bangla, a low-resource language with 230M+ speakers, but evaluation scope is narrow: single language, single model (Llama 2 7B), tiny dataset, no inter-rater reliability metrics, and no comparison to existing low-resource RAG systems.",
    494   "red_flags": [
    495     {
    496       "flag": "Microscopic dataset",
    497       "detail": "200 QA pairs from 27 Wikipedia articles. Insufficient for generalizable conclusions about Bangla RAG. No train/val/test split reported."
    498     },
    499     {
    500       "flag": "No statistical rigor",
    501       "detail": "Point estimates only. No confidence intervals, error bars, significance tests, or variance across runs. Single trial results."
    502     },
    503     {
    504       "flag": "Weak baselines",
    505       "detail": "Zero-shot and few-shot prompting are minimal baselines. No comparison to existing RAG methods, retrieval systems, or Bangla-specific NLP systems."
    506     },
    507     {
    508       "flag": "Manual evaluation without reliability",
    509       "detail": "Accuracy assessed by manual evaluation but no inter-rater agreement (Cohen's kappa, Fleiss' kappa) reported. Single evaluator bias possible."
    510     },
    511     {
    512       "flag": "Prompts not disclosed",
    513       "detail": "Actual prompt text given to Llama 2 not provided. Only flowcharts shown. Critical for reproducibility."
    514     },
    515     {
    516       "flag": "No ablation of TraSe",
    517       "detail": "Translative method ablated but not the selector component. What drives improvement: translation bridge, selector voting, or both? Unknown."
    518     },
    519     {
    520       "flag": "Circular evaluation design",
    521       "detail": "Translative method (Bangla→English→Bangla) benefits from Llama 2's superior English capabilities. This architectural bias is not discussed."
    522     },
    523     {
    524       "flag": "Contamination unaddressed",
    525       "detail": "Evaluating on Bangla Wikipedia content with Llama 2, whose training data cutoff and Bangla inclusion are not discussed. Possible data leakage."
    526     },
    527     {
    528       "flag": "Dataset not released",
    529       "detail": "200 QA pairs created but no explicit statement of release. GitHub code link provided but dataset availability unconfirmed."
    530     },
    531     {
    532       "flag": "Overstated claims",
    533       "detail": "Title and abstract claim 'empowering low-resource languages' but evaluation is single language, single domain, single model. Generalization premature."
    534     },
    535     {
    536       "flag": "No environment reproducibility",
    537       "detail": "transformers, LangChain, Llama 2 versions not specified. No requirements.txt or Dockerfile. Environment not reproducible."
    538     },
    539     {
    540       "flag": "Single model evaluation",
    541       "detail": "Only Llama 2 7B tested. Limitations section acknowledges this but no other models evaluated. Generalization to other models unclear."
    542     }
    543   ],
    544   "cited_papers": [
    545     {
    546       "title": "Retrieval-augmented generation for large language models: A survey",
    547       "relevance": "Comprehensive RAG survey; establishes state-of-art and recent advances in the field"
    548     },
    549     {
    550       "title": "Survey on retrieval-augmented text generation for LLMs",
    551       "relevance": "Recent RAG survey documenting methodological landscape"
    552     },
    553     {
    554       "title": "ReAct: Synergizing reasoning and acting in language models",
    555       "relevance": "Key baseline prompting method used in comparative evaluation"
    556     },
    557     {
    558       "title": "Language models are few-shot learners (GPT-3)",
    559       "relevance": "Foundational work on few-shot prompting, baseline method"
    560     },
    561     {
    562       "title": "BanglaBERT: Language model pretraining and benchmarks for low-resource language understanding evaluation in Bangla",
    563       "relevance": "Core embedding model used for retrieval; establishes Bangla NLP baseline"
    564     },
    565     {
    566       "title": "Active retrieval augmented generation (FLARE)",
    567       "relevance": "Advanced RAG technique demonstrating iterative retrieval refinement"
    568     },
    569     {
    570       "title": "Corrective retrieval augmented generation",
    571       "relevance": "Recent RAG innovation using retrieval quality assessment"
    572     },
    573     {
    574       "title": "The power of noise: Redefining retrieval for RAG systems",
    575       "relevance": "Recent work on retrieval quality challenges in RAG"
    576     }
    577   ],
    578   "engagement_factors": {
    579     "practical_relevance": {
    580       "score": 1,
    581       "justification": "Potentially useful for Bangla NLP practitioners but limited by 34% accuracy and single-model evaluation."
    582     },
    583     "surprise_contrarian": {
    584       "score": 0,
    585       "justification": "Confirms the expected finding that low-resource languages need specialized approaches; no surprising results."
    586     },
    587     "fear_safety": {
    588       "score": 0,
    589       "justification": "No AI safety or security concerns raised."
    590     },
    591     "drama_conflict": {
    592       "score": 0,
    593       "justification": "No controversy or conflict with existing work."
    594     },
    595     "demo_ability": {
    596       "score": 2,
    597       "justification": "Code is available on GitHub, though reproducibility is limited by missing environment specs and prompts."
    598     },
    599     "brand_recognition": {
    600       "score": 0,
    601       "justification": "Authors from Bangladeshi universities, no well-known lab or product association."
    602     }
    603   },
    604   "hn_data": {
    605     "threads": [],
    606     "top_points": 0,
    607     "total_points": 0,
    608     "total_comments": 0
    609   }
    610 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs