scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24958B)
      1 {
      2   "paper": {
      3     "title": "BASHEXPLAINER: Retrieval-Augmented Bash Code Comment Generation based on Fine-tuned CodeBERT",
      4     "authors": [
      5       "Chi Yu",
      6       "Guang Yang",
      7       "Xiang Chen",
      8       "Ke Liu",
      9       "Yanlin Zhou"
     10     ],
     11     "year": 2022,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2206.13325"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper provides a GitHub link to their project: https://github.com/NTDXYG/BASHEXPLAINER (referenced in Section II-C and contributions)."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The authors state they 'shared our corpus and scripts in our project homepage' and the corpus is constructed from publicly available NL2Bash and NLC2CMD data. The constructed corpus of 10,592 samples is shared via the GitHub repository."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper mentions PyTorch, Faiss, Textdistance, and Transformers packages, and states the hardware (Intel Xeon Silver 4210 CPU, RTX3090 GPU, Windows OS), but does not provide a requirements.txt, Dockerfile, or specific library versions needed to recreate the environment."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper references a project homepage with shared corpus and scripts, but does not include step-by-step reproduction instructions in the paper itself. There is no 'Reproducing Results' section or specific commands to run."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The main automated evaluation results in Table IV report only point estimates (e.g., BLEU-4 of 30.44%) with no confidence intervals or error bars. The human study in Table VII reports standard deviations but the automated results do not."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The human study reports that 'All p-values were less than 0.05, which means the comparison result difference exists statistically significant.' However, no significance tests are reported for the automated evaluation results."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper reports percentage improvements with baseline context throughout. For example, 'BASHEXPLAINER can improve the performance by at least 8.75%, 9.29%, 4.77% and 3.86%' in BLEU-3/4, METEOR, ROUGE-L over NNGen. Table IV provides full numerical context."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The corpus size of 10,592 samples and the human study sample of 100 Bash codes with 5 students are not justified. No power analysis or reasoning for why these sizes are sufficient is provided."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The automated evaluation results in Tables IV-VI report single-run numbers with no variance, standard deviation, or multiple-run results. The paper mentions that results from splitting the dataset three times with different random seeds are available on the project homepage, but it does not include those results in the paper itself. The human study reports standard deviations (Table VII) but the main automated experiments do not."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper compares against 10 baselines from three groups: information retrieval methods (LSI, VSM, BM25, NNGen), deep learning methods (CopyNet, Transformer, CODE-NN, CodeBERT), and hybrid methods (Hybrid-DeepCom, Rencos). Results in Table IV."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The baselines include contemporary methods such as CodeBERT (2020), Rencos (2020), Hybrid-DeepCom (2020), and Transformer-based approaches. Since this is the first study on Bash code comment generation, baselines from the broader code comment generation domain are appropriate."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Section V-C (RQ3) presents ablation experiments: with NNGen retrieval, reversed retrieval order, without normalization, with simple fusion, and without NMT. Table VI shows results for each ablation variant."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper uses three performance measures: BLEU (BLEU-1/2/3/4), METEOR, and ROUGE-L, all reported in Tables IV-VI."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Section VI-A describes a human study with 5 master students rating 100 Bash code comments on similarity, naturalness, and informativeness (0-4 scale). Results in Table VII. Fleiss Kappa inter-rater agreement is 0.723."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section IV-A states: 'we use a random sampling method to split the dataset into the training set, the validation set, and the test set in the ratio of 80%: 10%: 10%.' Results are reported on the test set."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Results are reported only as overall averages across all test examples. There is no breakdown by Bash utility type, command complexity, code length, or other categories. Table VIII shows three qualitative examples but not a systematic per-category analysis."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section VI-B provides qualitative analysis with examples where methods fail. For example, NNGen retrieves semantically different code for the last two examples in Table VIII, and CodeBERT misses the phrase 'under current directory' for the second example."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper reports that BASHEXPLAINER scores 0.07 points lower than NNGen on Naturalness in the human study (Table VII), honestly noting this weakness. The ablation study also shows where removing components reduces performance, which constitutes negative results for those configurations."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims 'BASHEXPLAINER can outperform all baselines by at least 8.75%, 9.29%, 4.77% and 3.86%' in BLEU-3/4, METEOR, ROUGE-L. These numbers are supported by Table IV. The claims about ablation experiments and human study are also supported by Tables VI and VII."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper makes causal claims through ablation studies (e.g., 'construction Bash code encoder by fine-tuning CodeBERT can help to significantly improve the performance'). The ablation design in RQ2 and RQ3 uses controlled single-variable manipulation (removing or replacing one component at a time), which is adequate for these causal claims."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper studies Bash code comment generation but the title and framing suggest broad applicability. Results are limited to the 135 most useful utilities identified by Linux users from the NL2Bash/NLC2CMD datasets, with code lengths mostly under 20 tokens. The paper does not explicitly bound its claims to this specific dataset scope or note that results may not generalize to longer or more complex Bash scripts."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The threats to validity section discusses generic concerns (implementation faults, corpus representativeness, performance measures) but does not consider specific alternative explanations for the results. For example, it does not discuss whether the improvements could be due to the additional training data rather than the architectural innovations, or whether the retrieval component benefits from the small dataset size."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper uses 'CodeBERT' but does not specify which version or checkpoint (e.g., 'microsoft/codebert-base'). No specific model version or release date is provided."
    139       },
    140       "prompts_provided": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "This paper uses fine-tuned encoder-decoder models, not prompting-based approaches. Prompts are not applicable."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Table III reports hyperparameters: decoder layers (6), hidden size (768), max input length (64), max output length (32), beam search size (10), top-k (8), CodeBERT hidden size (768). Section IV-D also states learning rate (2e-4), optimizer (AdamW), and number of epochs (30)."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No agentic scaffolding is used. This is a standard encoder-decoder architecture with a retrieval component."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section IV-A describes the data construction: combining NL2Bash and NLC2CMD corpora, removing duplicates, focusing on 135 most useful utilities. The final corpus size (10,592 samples) and train/validation/test split (80/10/10) are documented. Table II provides length statistics."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section VII (Threats to Validity) discusses four categories: internal validity, external validity, construct validity, and conclusion validity."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The threats section includes study-specific concerns: 'Since the Bash code comment generation problem has not been investigated in previous studies, we mainly choose the state-of-the-art baselines from the source code comment generation domain'; the corpus representativeness concern is addressed by combining two corpora; and the single-split concern is addressed by additional runs with different seeds."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what it does NOT show. It does not bound its claims to the 135 utilities tested, the short code lengths in the corpus, or acknowledge that results may not hold for longer/more complex Bash scripts or different Bash usage contexts."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The corpus and scripts are shared on the GitHub project page (https://github.com/NTDXYG/BASHEXPLAINER), and the source data comes from publicly available NL2Bash and NLC2CMD datasets."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section IV-A describes data collection: the NL2Bash corpus origin (developer Q&A forums, tutorials, technical websites, course materials), the NLC2CMD competition data, the merging and deduplication process, and the focus on 135 utilities."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The human study hired 'five master students, who have extensive experience in using Bash for Linux system development and maintenance' but does not describe how they were recruited, from which institution's pool, or whether this convenience sample introduces selection bias."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The data pipeline is documented: NL2Bash corpus + NLC2CMD competition data → merged → duplicates removed → 10,592 samples → 80/10/10 split. Table II shows statistics of the final corpus."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The Acknowledgment section states: 'This work is supported in part by the National Natural Science Foundation of China (Grant no. 61872263).'"
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "All authors are listed with their affiliation: School of Information Science and Technology, Nantong University, China. No product being evaluated is affiliated with the authors."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The funder is the National Natural Science Foundation of China, a government research funding agency with no financial stake in the outcome of Bash code comment generation research."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "There is no competing interests or financial interests declaration in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "This paper fine-tunes CodeBERT on a domain-specific corpus and evaluates the fine-tuned model. It does not evaluate a pre-trained model's raw capabilities on a benchmark — the evaluation is of the fine-tuned system's generation quality. Contamination of the pre-trained model is not the primary concern here."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "Same reasoning as training_cutoff_stated. The evaluation is of a fine-tuned system on a held-out test split, not of a pre-trained model's memorized knowledge."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "Same reasoning as training_cutoff_stated. The paper constructs its own corpus with a random train/test split rather than evaluating on a pre-existing public benchmark where contamination would be a concern."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The paper includes a human study with 5 master students evaluating 100 Bash codes. No pre-registration is mentioned."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "The human study involves human participants rating code comments, but no IRB or ethics board approval is mentioned."
    249       },
    250       "demographics_reported": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "Participants are described only as 'five master students who have extensive experience in using Bash for Linux system development and maintenance.' No further demographics (years of experience, gender, etc.) are reported."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "No inclusion or exclusion criteria for the student evaluators are stated beyond 'extensive experience in using Bash.'"
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "This is not an experimental study with treatment/control group assignment. All evaluators rate all methods. The order of questionnaires is described as different for different students."
    264       },
    265       "blinding_described": {
    266         "applies": true,
    267         "answer": true,
    268         "justification": "The paper states: 'students do not know which comment is generated by which method, and the order of questionnaires is different for different students.' This describes blinding of the evaluators."
    269       },
    270       "attrition_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "The paper states 5 students were hired but does not report whether all 5 completed the full evaluation or if there was any attrition."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "The paper does not report inference latency or computational cost per example for BASHEXPLAINER, despite the retrieval + neural generation pipeline having non-trivial cost."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "The paper mentions hardware (RTX3090 GPU) but does not report total training time, GPU hours, or computational budget for the experiments."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "BASHEXPLAINER outperforms all 10 baselines on BLEU-3/4, METEOR, and ROUGE-L by at least 8.75%, 9.29%, 4.77%, and 3.86% respectively.",
    292       "evidence": "Table IV shows automated evaluation results. Best baseline on BLEU-3 is NNGen (32.11) vs BASHEXPLAINER (34.92), an 8.75% improvement. Best baseline on BLEU-4 is NNGen (27.85) vs BASHEXPLAINER (30.44), a 9.29% improvement.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "The two-stage training strategy significantly improves performance, with BLEU-4 improving by 22.9% compared to single-stage training.",
    297       "evidence": "Table V compares w/o two-stage training (BLEU-4: 24.75%) vs BASHEXPLAINER (30.44%). The 22.9% relative improvement is verified.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "Each component of BASHEXPLAINER contributes to performance, as shown by ablation experiments.",
    302       "evidence": "Table VI shows removing or replacing any component (retrieval module, normalization, fusion layer, NMT) reduces performance across all metrics compared to the full BASHEXPLAINER.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "BASHEXPLAINER outperforms baselines in human evaluation on informativeness (+15.6%) and similarity (+19.9%).",
    307       "evidence": "Table VII shows human evaluation scores. BASHEXPLAINER scores 2.74 vs Hybrid-DeepCom's 2.37 on informativeness, and 2.41 vs Hybrid-DeepCom's 2.01 on similarity. p < 0.05 for all comparisons.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "BASHEXPLAINER is the first method to study automated Bash code comment generation.",
    312       "evidence": "Section I and II-C argue no prior work exists on Bash comment generation specifically, only on Java/Python comment generation and NL-to-Bash (the reverse direction).",
    313       "supported": "strong"
    314     }
    315   ],
    316   "methodology_tags": [
    317     "benchmark-eval"
    318   ],
    319   "key_findings": "BASHEXPLAINER, a retrieval-augmented code comment generation method based on fine-tuned CodeBERT, outperforms 10 baselines from the code comment generation literature on a newly constructed Bash code corpus of 10,592 samples. The two-stage training strategy (first fine-tuning CodeBERT, then training the full retrieval-augmented system) provides substantial improvements. A human study with 5 evaluators confirms BASHEXPLAINER generates comments that are more informative and more similar to the reference comments than those of the baselines, though pure retrieval (NNGen) still produces slightly more natural-sounding comments.",
    320   "red_flags": [
    321     {
    322       "flag": "No variance or confidence intervals for main results",
    323       "detail": "Tables IV-VI report single-run point estimates for all automated evaluation metrics. The paper mentions additional splits with different random seeds on the project homepage but does not include these in the paper, making it impossible to assess result stability."
    324     },
    325     {
    326       "flag": "Improvements may be inflated by percentage calculation",
    327       "detail": "Improvements are reported as relative percentages (e.g., '8.75% improvement') on already-low baseline numbers, which can appear more impressive than the absolute differences. For example, BLEU-4 improves from 27.85 to 30.44, an absolute difference of 2.59 percentage points."
    328     },
    329     {
    330       "flag": "Small-scale human study with convenience sample",
    331       "detail": "The human evaluation uses only 5 master students from a single university. No recruitment details, demographics, or selection criteria are provided beyond 'extensive experience in using Bash.' The 0-4 scale with 100 examples and 5 raters may have limited statistical power."
    332     },
    333     {
    334       "flag": "No per-category performance breakdown",
    335       "detail": "Results are only reported as aggregate averages. There is no breakdown by Bash utility type, command complexity, or code length, making it impossible to assess where the method succeeds or fails systematically."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    341       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    342       "year": 2020,
    343       "relevance": "Foundational pre-trained model for code understanding that BASHEXPLAINER is built upon; relevant to LLM-based code processing survey scope."
    344     },
    345     {
    346       "title": "NL2Bash: A Corpus and Semantic Parser for Natural Language Interface to the Linux Operating System",
    347       "authors": ["Xi Victoria Lin", "Chenglong Wang", "Luke Zettlemoyer", "Michael D. Ernst"],
    348       "year": 2018,
    349       "relevance": "Foundational dataset and task for Bash code generation from natural language, directly relevant to code generation evaluation methodology."
    350     },
    351     {
    352       "title": "Retrieval-based Neural Source Code Summarization",
    353       "authors": ["Jian Zhang", "Xu Wang", "Hongyu Zhang", "Hailong Sun", "Xudong Liu"],
    354       "year": 2020,
    355       "relevance": "Retrieval-augmented approach for code summarization (Rencos), directly relevant to hybrid code generation methods evaluation."
    356     },
    357     {
    358       "title": "Retrieve and Refine: Exemplar-based Neural Comment Generation",
    359       "authors": ["Bolin Wei", "Yongmin Li", "Ge Li", "Xin Xia", "Zhi Jin"],
    360       "year": 2020,
    361       "relevance": "Retrieval-augmented code comment generation method (Re2com), relevant to evaluation methodology in code summarization."
    362     },
    363     {
    364       "title": "A Transformer-based Approach for Source Code Summarization",
    365       "authors": ["Wasi Ahmad", "Saikat Chakraborty", "Baishakhi Ray", "Kai-Wei Chang"],
    366       "year": 2020,
    367       "relevance": "Transformer-based code summarization approach, directly relevant to deep learning methods for code understanding."
    368     },
    369     {
    370       "title": "Summarizing Source Code Using a Neural Attention Model",
    371       "authors": ["Srinivasan Iyer", "Ioannis Konstas", "Alvin Cheung", "Luke Zettlemoyer"],
    372       "year": 2016,
    373       "relevance": "Pioneering work (CODE-NN) on neural code summarization, relevant as a baseline method in code generation research."
    374     },
    375     {
    376       "title": "Retrieval-Augmented Generation for Code Summarization via Hybrid GNN",
    377       "authors": ["Shangqing Liu", "Yu Chen", "Xiaofei Xie", "Jing Kai Siow", "Yang Liu"],
    378       "year": 2020,
    379       "relevance": "Hybrid retrieval-augmented approach using graph neural networks for code summarization, relevant to the intersection of retrieval and deep learning for code."
    380     },
    381     {
    382       "title": "Deep Code Comment Generation with Hybrid Lexical and Syntactical Information",
    383       "authors": ["Xing Hu", "Ge Li", "Xin Xia", "David Lo", "Zhi Jin"],
    384       "year": 2020,
    385       "relevance": "Hybrid code comment generation method (Hybrid-DeepCom) combining lexical and syntactic information, a key baseline in code summarization evaluation."
    386     },
    387     {
    388       "title": "ComFormer: Code Comment Generation via Transformer and Fusion Method-based Hybrid Code Representation",
    389       "authors": ["Guang Yang", "Xiang Chen", "Jinqiu Cao", "Shuhan Xu", "Zhanqi Cui", "Chi Yu", "Ke Liu"],
    390       "year": 2021,
    391       "relevance": "Transformer-based code comment generation method using fusion, by the same research group, relevant to code generation methodology."
    392     }
    393   ]
    394 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs