scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24743B)
      1 {
      2   "paper": {
      3     "title": "BanglaForge: LLM Collaboration with Self-Refinement for Bangla Code Generation",
      4     "authors": [
      5       "Mahir Labib Dihan",
      6       "Sadif Ahmed",
      7       "Md Nafiu Rahman"
      8     ],
      9     "year": 2025,
     10     "venue": "BLP-2025 (Workshop on Bangla Language Processing)"
     11   },
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No repository URL or code archive is provided in the paper. The paper references the BLP-2025 starter kit URL for the dataset but does not provide a link to the BanglaForge implementation itself."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The dataset comes from the BLP-2025 shared task and is publicly available via the official starter kit at https://noshinulfat.github.io/blp25_code_generation_task/#/get-started. External benchmarks mHumanEval-Bangla and MBPP-Bangla are also publicly available."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. Appendix B mentions generation parameters (temperature, top_p, max_new_tokens) but does not describe the software environment, library versions, or hardware used."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided. While the methodology is described in detail and prompts are given in the appendix, there are no explicit instructions for running the pipeline end-to-end."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "All results are reported as single-point Pass@1 percentages (e.g., 84.00%, 95.5%) with no confidence intervals, error bars, or uncertainty estimates."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper makes comparative claims (e.g., ablation components cause performance drops) but uses no statistical significance tests. Differences are simply compared as raw percentage points."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper reports percentage improvements with baseline context throughout. For example, the ablation study in Table 11 shows the full model at 95.5% vs. without translation at 73.6% (a 21.9 percentage point drop), without feedback loop at 69.8%, etc. These provide sufficient context for understanding effect magnitudes."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The dataset sizes are stated (Trial: 74, Dev: 400, Test: 500 in Table 1) but no justification is given for why these sizes are adequate for the claims made. No power analysis or acknowledgment of sample size limitations is provided."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No variance, standard deviation, or any spread measure is reported. Results appear to be from single runs. There is no mention of multiple runs or seeds."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Table 2 compares multiple models including Gemma-1B, GPT-OSS-20B, DeepSeek-R1-Llama-70B, Gemini-2.0-Flash, and Lg Exaone Deep 32B under various configurations, serving as baselines for comparison."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The baselines include contemporary models such as DeepSeek-R1-Llama-70B (2025), Gemini-2.0-Flash (2024), Gemini-2.5-Pro (2025), and Lg Exaone Deep 32B (2025). These are recent and competitive models."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Appendix E contains a thorough ablation study with 8 variants (Tables 4-11): removing translation, glossary, feedback loop, reviewer, RAG, varying iteration count M, and varying k. Each isolates a single component's contribution."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "Only Pass@1 accuracy is reported. No other metrics such as Pass@k (for k>1), compilation rate, token efficiency, or latency are reported."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No human evaluation of generated code quality is performed. All evaluation is automated via unit test execution (Pass@1). Given claims about code 'robustness' and 'stylistic reliability,' human evaluation of code quality would have been relevant."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The paper explicitly separates development (400 problems) and test (500 problems) sets. Development set results are used for model selection and ablations; final results are reported on the held-out test set (Section 5.3, Table 2)."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Only overall Pass@1 accuracy is reported. No per-category, per-difficulty, or per-problem-type breakdowns are provided despite the dataset containing diverse problem types."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Appendix G (Failure Cases and Dataset Limitations) discusses specific failure modes including semantic translation errors (e.g., 'even' translated as 'এমনকি' instead of 'জোড়'), incorrect terminology (e.g., 'Map' translated as geographic map), and loss of context/intent."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The ablation study reports several configurations that hurt performance (e.g., removing translation drops to 73.6%, removing feedback loop drops to 69.8%). Table 10 shows that increasing k beyond 5 slightly decreases performance (94.7% at k=7 vs. 95.5% at k=5)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims 'Pass@1 accuracy of 84.00%' on the BLP-2025 benchmark, which matches the test set result in Table 2 (Gemini-2.5-Pro with RAG: 84.00%). The abstract's claims about 'effectiveness of retrieval, model collaboration, and self-refinement' are supported by the ablation study."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper makes causal claims via ablation studies (e.g., 'removing the translation stage reduces Pass@1 accuracy by nearly 22 percent'). The ablation design uses controlled single-variable manipulation, which is adequate for these within-system component contribution claims."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The abstract and conclusion claim effectiveness for 'low-resource Bangla code generation' broadly, but results are only on the BLP-2025 benchmark which consists of machine-translated English problems. The paper's title and framing suggest generality to Bangla code generation, but the dataset is machine-translated MBPP/HumanEval, not natively Bangla programming tasks. The Limitations section (Section 7) acknowledges this partially but the title and abstract do not bound the claims."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper does not discuss alternative explanations for its results. For example, it does not consider whether the improvement from translation arises simply because the models were trained on English code, making English prompts inherently easier, rather than from anything novel in the translation component itself. There is no discussion of threats to validity or confounds."
    129       }
    130     },
    131     "setup_transparency": {
    132       "model_versions_specified": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "Models are referenced by marketing names only: 'Gemini-2.0-Flash', 'Gemini-2.5-Pro', 'DeepSeek-R1-Llama-70B', 'Lg Exaone Deep 32B'. No API versions, snapshot dates, or specific model IDs are provided. The references list 'Accessed: 2025-10-05' for model pages but not specific model version identifiers."
    136       },
    137       "prompts_provided": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Full prompt templates are provided in Appendix C (Figures 3-9): system prompts for coder, reviewer, and translator models, main prompt templates, failed attempt feedback templates, and few-shot example templates. While these contain placeholders, the placeholder semantics are clearly defined and the fill values come from the dataset (instructions, tests, etc.)."
    141       },
    142       "hyperparameters_reported": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Appendix B states: 'temperature = 0.7, top_p = 0.9, and max_new_tokens = 1024. Each query generated n = 1 output sample per decoding pass.' Additional parameters: k=5 for retrieval, M=5 for maximum refinement iterations."
    146       },
    147       "scaffolding_described": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The agentic scaffolding (translation -> retrieval -> coder -> reviewer -> feedback loop) is described in detail in Section 4 and Algorithm 1 (Appendix F). The workflow includes tool descriptions, the retry logic (max M=5 iterations), and the feedback mechanism with error categories (Table 3)."
    151       },
    152       "data_preprocessing_documented": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Section 3 describes the dataset structure, splits (Table 1: Trial 74, Dev 400, Test 500), and the JSON format of each entry. Section 4.1 describes function prototype normalization. Section 4.2 explains how the retrieval database is constructed (Trial set for Dev experiments, Trial+Dev for Test experiments)."
    156       }
    157     },
    158     "limitations_and_scope": {
    159       "limitations_section_present": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 7 ('Limitations') provides a dedicated discussion of five specific limitations: translation quality dependency, TF-IDF retrieval limitations, assumption of well-structured input, fixed iteration limits, and machine-translated dataset limitations."
    163       },
    164       "threats_to_validity_specific": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The limitations section discusses threats specific to this study: 'the system relies heavily on high-quality bilingual translation; inaccuracies in Bangla-to-English mapping or glossary coverage can propagate errors,' and 'since the dataset itself originates from machine-translated English sources, true Bangla-native problem framing and linguistic diversity remain under-represented.'"
    168       },
    169       "scope_boundaries_stated": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The paper does not explicitly state what the results do NOT show. While limitations are discussed, there is no explicit statement bounding claims (e.g., 'our results do not generalize to natively-written Bangla problems' or 'we have not tested on languages other than Bangla'). The title claims 'Bangla Code Generation' broadly without qualifying that this is only tested on machine-translated benchmark problems."
    173       }
    174     },
    175     "data_integrity": {
    176       "raw_data_available": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The benchmark dataset is publicly available via the BLP-2025 starter kit, but the authors' experimental outputs (generated code, per-problem results, logs of refinement iterations) are not released for independent verification."
    180       },
    181       "data_collection_described": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 3 describes the data source: 'Our dataset comes from the Bangla Code Generation shared task (Task 2) at BLP-2025.' The dataset structure (id, Bangla instruction, response, test_list) and splits are clearly documented."
    185       },
    186       "recruitment_methods_described": {
    187         "applies": false,
    188         "answer": false,
    189         "justification": "No human participants were involved. The study uses publicly available benchmark datasets."
    190       },
    191       "data_pipeline_documented": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The data pipeline from input (Bangla instruction + unit tests) through translation, retrieval, code generation, review, and refinement is documented in Section 4 and Algorithm 1. The retrieval database construction is also described (Section 4.2)."
    195       }
    196     },
    197     "conflicts_of_interest": {
    198       "funding_disclosed": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No funding information, acknowledgments section, or grant numbers are mentioned anywhere in the paper."
    202       },
    203       "affiliations_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Author affiliations are clearly stated: all three authors are from the Department of Computer Science and Engineering, Bangladesh University of Engineering and Technology (BUET)."
    207       },
    208       "funder_independent_of_outcome": {
    209         "applies": false,
    210         "answer": false,
    211         "justification": "No funding is disclosed. The authors are university researchers with no apparent corporate affiliation that would create a conflict regarding the evaluated models."
    212       },
    213       "financial_interests_declared": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No competing interests or financial interests statement is present in the paper."
    217       }
    218     },
    219     "contamination": {
    220       "training_cutoff_stated": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "The paper uses multiple pre-trained LLMs (Gemini-2.5-Pro, Lg Exaone Deep 32B, etc.) on code generation benchmarks derived from HumanEval and MBPP, but does not state the training data cutoff dates for any of the models used."
    224       },
    225       "train_test_overlap_discussed": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No discussion of potential train/test overlap. The benchmarks are Bangla translations of HumanEval and MBPP, which are well-known English benchmarks. The models may have seen the English originals during pre-training, which could affect performance since the pipeline translates Bangla back to English. This contamination vector is not addressed."
    229       },
    230       "benchmark_contamination_addressed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "HumanEval (2021) and MBPP are well-known benchmarks published before the training cutoffs of the models used. The Bangla translations may provide partial decontamination, but the pipeline explicitly translates back to English before generating code, potentially re-introducing contamination. This is not discussed."
    234       }
    235     },
    236     "human_studies": {
    237       "pre_registered": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No human participants were involved in the study."
    241       },
    242       "irb_or_ethics_approval": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants were involved in the study."
    246       },
    247       "demographics_reported": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants were involved in the study."
    251       },
    252       "inclusion_exclusion_criteria": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants were involved in the study."
    256       },
    257       "randomization_described": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants were involved in the study."
    261       },
    262       "blinding_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants were involved in the study."
    266       },
    267       "attrition_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants were involved in the study."
    271       }
    272     },
    273     "cost_and_practicality": {
    274       "inference_cost_reported": {
    275         "applies": true,
    276         "answer": false,
    277         "justification": "No inference cost, API cost, tokens consumed, or wall-clock time is reported. The system calls multiple LLMs (translator, coder, reviewer) with up to 5 refinement iterations per problem, which could be expensive, but cost is never mentioned."
    278       },
    279       "compute_budget_stated": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No computational budget, hardware specifications, GPU hours, or total API spend is reported. The paper does not mention what hardware or infrastructure was used to run the experiments."
    283       }
    284     }
    285   },
    286   "claims": [
    287     {
    288       "claim": "BanglaForge achieves 84.00% Pass@1 accuracy on the BLP-2025 Bangla Code Generation benchmark test set.",
    289       "evidence": "Table 2 shows Gemini-2.5-Pro with RAG (Trial+Dev), 5 examples, translation, and 1 unit test achieves 84.00% Pass@1 on the test set.",
    290       "supported": "strong"
    291     },
    292     {
    293       "claim": "English translation is the most impactful component, with removal causing a 21.9 percentage point drop in accuracy.",
    294       "evidence": "Table 4 (Appendix E.1) shows Full Model at 95.5% vs. Bangla Only at 73.6% on the development set using Lg Exaone Deep 32B.",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "The feedback loop is critical, with removal causing a 25.7 percentage point drop.",
    299       "evidence": "Table 6 (Appendix E.3) shows Full Model at 95.5% vs. Without Feedback Loop at 69.8% on the development set.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "Retrieval augmentation provides consistent improvement over manual few-shot prompting.",
    304       "evidence": "Table 9 (Appendix E.6) shows RAG at 95.5% vs. Manual Few-shot at 94.2%, a 1.3 percentage point improvement on the development set.",
    305       "supported": "weak"
    306     },
    307     {
    308       "claim": "The Reviewer LLM improves robustness and coverage, with removal causing a 5.1 percentage point drop.",
    309       "evidence": "Table 7 (Appendix E.4) shows Full Model at 95.5% vs. Without Reviewer at 90.4% on the development set.",
    310       "supported": "moderate"
    311     },
    312     {
    313       "claim": "The controlled translation glossary improves accuracy by over 7 percentage points.",
    314       "evidence": "Table 5 (Appendix E.2) shows With Glossary at 95.5% vs. Without Glossary at 88.2% on the development set.",
    315       "supported": "moderate"
    316     }
    317   ],
    318   "methodology_tags": [
    319     "benchmark-eval"
    320   ],
    321   "key_findings": "BanglaForge is a retrieval-augmented dual-LLM framework for generating Python code from Bangla natural language descriptions that achieves 84.00% Pass@1 on the BLP-2025 test set. The system combines Bangla-to-English translation, TF-IDF-based example retrieval, a coder-reviewer architecture, and iterative self-refinement with execution feedback. Ablation studies on the development set show that English translation and the feedback loop are the two most impactful components, each contributing over 20 percentage points to performance. The results suggest that current LLMs still struggle with direct Bangla code generation and benefit substantially from translation-mediated approaches.",
    322   "red_flags": [
    323     {
    324       "flag": "No variance or multiple runs",
    325       "detail": "All results appear to be from single runs with no error bars, standard deviations, or repeated experiments. Given that LLM outputs are stochastic (temperature=0.7), results could vary substantially across runs."
    326     },
    327     {
    328       "flag": "Contamination risk via back-translation",
    329       "detail": "The benchmark consists of machine-translated HumanEval and MBPP problems. The pipeline translates Bangla instructions back to English before code generation. The models may have seen the original English HumanEval/MBPP problems during pre-training, which could inflate performance. This contamination vector is never discussed."
    330     },
    331     {
    332       "flag": "Single metric evaluation",
    333       "detail": "Only Pass@1 is reported. Additional metrics like Pass@k, compilation success rate, or code quality measures would provide a more complete picture of system performance."
    334     },
    335     {
    336       "flag": "Ablation on dev set only, final results use different model",
    337       "detail": "All ablation experiments use Lg Exaone Deep 32B on the development set, but the final test set result (84.00%) uses Gemini-2.5-Pro. It is unclear whether the component contributions observed with one model transfer to the other. The Lg Exaone Deep 32B test set result (80.60%) is lower than many ablated dev set configurations."
    338     },
    339     {
    340       "flag": "No cost reporting despite multi-LLM pipeline",
    341       "detail": "The system calls three separate LLMs (translator, coder, reviewer) with up to 5 refinement iterations per problem, but no cost, latency, or resource usage is reported. This makes practical applicability assessment impossible."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Evaluating large language models trained on code",
    347       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    348       "year": 2021,
    349       "arxiv_id": "2107.03374",
    350       "relevance": "Introduces HumanEval, the foundational code generation benchmark from which the Bangla evaluation dataset is derived."
    351     },
    352     {
    353       "title": "mHumanEval - a multilingual benchmark to evaluate large language models for code generation",
    354       "authors": ["Nishat Raihan", "Antonios Anastasopoulos", "Marcos Zampieri"],
    355       "year": 2025,
    356       "relevance": "Multilingual extension of HumanEval used as one of the evaluation benchmarks in this paper; directly relevant to cross-lingual code generation evaluation."
    357     },
    358     {
    359       "title": "TigerCoder: A Novel Suite of LLMs for Code Generation in Bangla",
    360       "authors": ["Nishat Raihan", "Antonios Anastasopoulos", "Marcos Zampieri"],
    361       "year": 2025,
    362       "arxiv_id": "2509.09101",
    363       "relevance": "Introduces Bangla-specific code LLMs and the MBPP-Bangla benchmark used in this paper's evaluation."
    364     },
    365     {
    366       "title": "BenLLMEval: A comprehensive evaluation into the potentials and pitfalls of large language models on Bengali NLP",
    367       "authors": ["Mohsinul Kabir", "Mohammed Saidul Islam", "Md Tahmid Rahman Laskar"],
    368       "year": 2023,
    369       "arxiv_id": "2309.13173",
    370       "relevance": "Comprehensive evaluation of LLMs on Bangla NLP tasks revealing performance gaps, relevant to understanding LLM capabilities in low-resource languages."
    371     },
    372     {
    373       "title": "DeepSeek-R1: Reasoning models built on llama-70b",
    374       "authors": ["DeepSeek AI"],
    375       "year": 2025,
    376       "relevance": "One of the baseline models evaluated in the paper for Bangla code generation."
    377     },
    378     {
    379       "title": "Dense passage retrieval for open-domain question answering",
    380       "authors": ["Vladimir Karpukhin", "Barlas Oguz", "Sewon Min"],
    381       "year": 2020,
    382       "relevance": "Foundational work on dense retrieval that the paper compares against in justifying its TF-IDF retrieval approach."
    383     },
    384     {
    385       "title": "BanglaBERT: Language model pretraining and benchmarks for low-resource language understanding evaluation in Bangla",
    386       "authors": ["Abhik Bhattacharjee", "Tahmid Hasan", "Wasi Uddin Ahmad"],
    387       "year": 2022,
    388       "relevance": "Key work on Bangla-specific language model pretraining, relevant to understanding low-resource language model capabilities."
    389     },
    390     {
    391       "title": "BEnQA: A question answering and reasoning benchmark for Bengali and English",
    392       "authors": ["Sheikh Shafayat", "H. M. Quamran Hasan", "Minhajur Rahman Chowdhury Mahim"],
    393       "year": 2024,
    394       "arxiv_id": "2403.10900",
    395       "relevance": "Parallel Bengali-English QA benchmark showing that cross-lingual approaches improve Bengali performance, relevant to the translation-mediated approach used here."
    396     }
    397   ]
    398 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs