scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26263B)
      1 {
      2   "paper": {
      3     "title": "AUTOGENICS: Automated Generation of Context-Aware Inline Comments for Code Snippets on Programming Q&A Sites Using LLM",
      4     "authors": ["Suborno Deb Bappon", "Saikat Mondal", "Banani Roy"],
      5     "year": 2024,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2408.15411"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "A replication package is provided at https://github.com/replication-pckg/AUTOGENICS (reference [35]), and the paper states 'Replication Package available in our online appendix [35]'."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses data collected from Stack Overflow via the StackExchange Data API (reference [17]), which is publicly available. The replication package [35] is stated to contain the data."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, requirements.txt, Dockerfile, or detailed dependency lists are provided in the paper. The paper mentions Flask, LangChain, and Gemini API but does not provide version-specific environment setup information."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper describes the methodology at a high level but does not provide step-by-step reproduction instructions or scripts to replicate the main experiments. The reader would have to piece together the process from the methodology section."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as mean and median values in tables (Tables V, VI, VIII) but no confidence intervals or error bars are provided."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims AUTOGENICS-generated comments 'outperform' standard LLM comments and that context-aware comments show 'significant enhancements', but no statistical significance tests (e.g., Wilcoxon, t-test) are reported to support these comparative claims."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "While mean score differences can be computed from Tables V and VIII (e.g., accuracy improving from 4.4 to 4.8 with context), no formal effect sizes (Cohen's d, etc.) are reported. The raw differences are small (0.2-0.6 on a 5-point scale) and their practical significance is not assessed."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper states 'this sample size is statistically significant with a 95% confidence level and a 5% error margin [37, 38]' for the 400 code snippets. However, the survey sample of 14 participants is not similarly justified."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, interquartile ranges, or other spread measures are reported. Tables show only mean and median values."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares AUTOGENICS (context-aware + noise filtering) against standard LLM generation without context (Table VIII), and also compares Gemini 1.5 Pro vs GPT-4 (Table VI)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Gemini 1.5 Pro and GPT-4 were state-of-the-art LLMs at the time of the study (2024). The paper does not compare against prior non-LLM comment generation tools, but the focus is on LLM-based generation."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The comparison between standard LLM (without context) and AUTOGENICS (with context + noise filtering) in Table VIII functions as a partial ablation. However, context and noise filtering are not ablated separately — they are combined."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Four evaluation metrics are used: accuracy, adequacy, conciseness, and usefulness (Table II)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
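     83         "justification": "The 400 code snippets were manually evaluated by the authors across four metrics (Section III-C, 100 person-hours): two authors calibrated on 80 snippets (kappa=0.94), after which the first author rated the remaining 320 alone. Additionally, 14 SO users participated in a survey evaluating comment effectiveness (Section III-D, Table VII)."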
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a machine learning training/testing study. The evaluation is manual human assessment of generated comments, so held-out test sets are not structurally applicable."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by programming language (Python vs Java) and by quartile of code snippet length (Q1-Q4) in all main tables (Tables V, VI, VII, VIII)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses that 'LLMs-generated comments are less effective for shorter code snippets and sometimes produce noisy comments' (Abstract), and Section VII discusses the 'Dilemma of Clarity Vs. Conciseness' as a limitation."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that comments are less effective for shorter code snippets (Q1), that conciseness decreases for longer snippets, and that LLMs sometimes generate noisy comments for import statements and variable declarations. These motivated the noise filtering mechanism."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims LLMs show 'promising effectiveness', comments are 'less effective for shorter code snippets', and AUTOGENICS comments 'outperform those of standard LLMs'. These are supported by Tables V, VIII, and the discussion sections, though the 'outperform' claim lacks statistical testing."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper claims AUTOGENICS comments are more effective because of additional context and noise filtering, which is a causal claim. However, context and noise filtering are applied together, making it impossible to attribute improvement to either component. No controlled ablation separates these two mechanisms."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper tests on Python and Java SO snippets only, but the title says 'Programming Q&A Sites Using LLM' (broader than SO). The abstract claims AUTOGENICS 'might enhance code comprehension' and 'improve developers' ability to learn and reuse code more accurately' without bounding these claims to the tested setting. The conclusion mentions plans to 'explore the effectiveness of AUTOGENICS across different programming languages and Q&A platforms', implicitly acknowledging the current scope but not explicitly bounding claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The threats to validity section discusses generalizability and evaluation bias but does not consider alternative explanations for the observed results. For example, it does not consider whether the improvement from context could be due to longer/more detailed prompts rather than contextual information itself, or whether the noise filtering alone accounts for most improvement."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper says 'Gemini 1.5 Pro' and 'GPT-4' but provides no snapshot dates, API versions, or specific model identifiers (e.g., 'gpt-4-0613'). For GPT-4, the reference [34] just links to https://openai.com/index/gpt-4/."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper provides three complete prompts: the inline comment generation prompt for Gemini (Section III-B), the question context extraction prompt (Section III-E), and the context-aware inline comment generation prompt (Section III-E). These include actual text used, not just descriptions."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No temperature, top-p, max tokens, or other API hyperparameters are reported for either Gemini 1.5 Pro or GPT-4."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The AUTOGENICS architecture is described in detail in Section III-E with Figure 4: browser extension content script, background script, Flask server, LangChain integration, two-step prompting (context extraction then comment generation), and noise filtering mechanism."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section III-A describes the data pipeline: collection from StackExchange API, filtering to accepted answers with single code snippets (2,963,498 -> 864,077), quartile analysis by LOC, and stratified random sampling of 50 per quartile per language (400 total). Table I provides counts at each stage."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section VIII 'Threats to Validity' discusses external validity, internal validity, construct validity, and statistical conclusion threats across multiple paragraphs."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The threats section includes study-specific concerns: the survey sample of 14 developers and potential snowball sampling bias, the use of specific language models (Gemini and GPT-4), and that results are limited to Java and Python SO answers. The inter-rater agreement (kappa=0.94) is cited to mitigate bias concerns."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "While the threats section mentions results 'may not be generalized to all SO answers code snippets', it does not explicitly state what the results do NOT show. The paper does not bound its claims about code comprehension improvement, developer productivity gains, or educational benefits. The conclusion's future work mention of other languages/platforms implicitly acknowledges scope but does not explicitly delineate boundaries."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "A replication package is provided at the GitHub URL [35], and the underlying SO data is from the publicly accessible StackExchange Data API [17]."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section III-A describes data collection in detail: answers from StackExchange Data API, posted on or before February 2024, restricted to accepted answers with single code snippets for Python and Java. Exact counts are provided in Table I."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section III-D describes participant recruitment: snowball approach starting from known software developers, plus open circulars on Facebook groups and LinkedIn targeting professional Python/Java developers. Table III provides demographic breakdown."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: 2,963,498 total answers -> 864,077 accepted answers with single code snippets -> quartile division -> stratified random sampling of 400 -> inline comment generation -> manual evaluation. Figure 3 shows the methodology schematic."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The acknowledgment section states: 'This research is supported in part by the industry-stream NSERC CREATE in Software Analytics Research (SOAR).'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are affiliated with the Department of Computer Science, University of Saskatchewan, Canada. No commercial affiliation with Google (Gemini) or OpenAI (GPT-4) is present."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "NSERC CREATE SOAR is a Canadian government research training grant. The funder has no financial interest in whether LLM-generated inline comments are effective."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is provided in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate model capability on a benchmark. It uses LLMs to generate inline comments and then manually evaluates the quality of those comments. There is no benchmark contamination concern — the task is generation quality assessment, not benchmark performance."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No benchmark evaluation is performed. The paper generates comments and evaluates them manually, so train/test overlap is not applicable."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark is used. The evaluation is manual assessment of generated inline comments, not model performance on a standardized benchmark."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No pre-registration of the study or survey is mentioned."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No IRB or ethics board approval is mentioned. The paper states they 'consider ethical issues from the established best practices [51, 52]' including consent and confidentiality, but does not mention formal ethics review."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Table III reports participant demographics: development experience (years in ranges), profession (SW Developer, Academician, Student), and frequency of SO usage (Daily, Weekly, Monthly). Countries are mentioned (Canada, Bangladesh)."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "Section III-D states: 'participants must confirm their consent, agree to data processing, be familiar with the SO, and have experience in programming languages (e.g., Python or Java).' These are explicit eligibility criteria."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "This is a survey study, not a randomized experiment. Participants all evaluated the same type of content (LLM-generated comments). There are no treatment/control conditions requiring randomization."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "This is a cross-sectional survey where participants evaluate comment quality. Blinding is not applicable as there are no experimental conditions to blind."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "The paper states they recruited 14 participants and 'received 14 valid responses', and that pilot survey responses (3 practitioners) 'were excluded from the final analysis'. This accounts for all participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper mentions Gemini is 'freely accessible (offers 50 requests per day)' but does not report the total API cost, tokens consumed, or wall-clock time for generating comments for 400 snippets. The 100 person-hours for manual evaluation is mentioned but not API costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget is stated. The paper mentions 100 person-hours for manual evaluation but not the computational resources used for LLM inference."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "LLMs (Gemini 1.5 Pro) generate highly accurate inline comments for SO answer code snippets, with mean accuracy ranging from 4.65 to 4.88 out of 5 for Python and 4.70 to 4.86 for Java.",
    286       "evidence": "Table V shows mean accuracy scores across quartiles for both Python and Java, with medians consistently at 5. Section IV provides detailed analysis.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "LLM-generated inline comments are less effective for shorter code snippets and more effective for longer ones.",
    291       "evidence": "Tables V and VII show an upward trend in accuracy and usefulness from Q1 to Q3, with a slight decline in Q4. This is observed for both Python and Java.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Gemini 1.5 Pro performs comparably to or better than GPT-4 for inline comment generation.",
    296       "evidence": "Table VI shows GPT-4 results on 40 snippets. Gemini scores are generally equal or higher on accuracy and conciseness, and notably higher on adequacy and usefulness. However, the comparison is on only 40 snippets with no statistical test.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "AUTOGENICS-generated context-aware comments outperform standard LLM-generated comments across all four metrics.",
    301       "evidence": "Table VIII compares without-context vs with-context scores on 40 snippets. Mean scores improve across most quartiles for accuracy, adequacy, and usefulness, though improvements are small (0.2-0.6 on a 5-point scale). No statistical significance test is applied.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "Approximately 79% of participants expressed strong interest in an automated tool for generating inline comments.",
    306       "evidence": "Figure 5a shows 78.6% 'Very Interested' and 14.3% 'Somewhat Interested'. Based on 14 participants.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "The survey results are consistent with the manual evaluation results.",
    311       "evidence": "Table VII (survey) and Table V (manual) show similar patterns of high accuracy, increasing adequacy/usefulness with code length, and decreasing conciseness. However, consistency is claimed without formal statistical comparison.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval", "qualitative"],
    316   "key_findings": "LLMs (Gemini 1.5 Pro) generate inline comments for Stack Overflow code snippets with high accuracy (mean 4.65-4.88/5), with effectiveness increasing for longer code snippets. A survey of 14 SO users corroborated the manual evaluation findings. AUTOGENICS, a browser plugin that incorporates question context and noise filtering, is reported to produce comments that score higher than standard LLM-generated comments on all four metrics (accuracy, adequacy, conciseness, usefulness), though the improvements are modest, not uniform across quartiles, and not statistically tested. About 79% of surveyed developers expressed strong interest in such a tool.",
    317   "red_flags": [
    318     {
    319       "flag": "No statistical significance testing",
    320       "detail": "All comparative claims (Gemini vs GPT-4, AUTOGENICS vs standard LLM) are based on comparing mean/median values without any significance tests. The improvements are small (0.2-0.6 on a 5-point Likert scale) and could easily be within random variation."
    321     },
    322     {
    323       "flag": "Very small survey sample",
    324       "detail": "The user study involves only 14 participants. This is too small for reliable statistical inference, yet the paper draws strong conclusions from the survey (e.g., '79% expressed strong interest' is 11 out of 14 people)."
    325     },
    326     {
    327       "flag": "Confounded ablation",
    328       "detail": "AUTOGENICS adds both context-awareness and noise filtering simultaneously. The paper cannot attribute improvements to either component individually, making it impossible to determine which mechanism (or both) drives the observed improvement."
    329     },
    330     {
    331       "flag": "No variance or uncertainty quantification",
    332       "detail": "Tables report only mean and median values. No standard deviations, confidence intervals, or other uncertainty measures are provided, making it impossible to assess the reliability of the reported differences."
    333     },
    334     {
    335       "flag": "Small evaluation subset for key comparisons",
    336       "detail": "The GPT-4 comparison uses only 40 snippets and the AUTOGENICS evaluation uses only 40 snippets (5 per quartile per language). These small subsets weaken the reliability of the comparative claims."
    337     },
    338     {
    339       "flag": "Single evaluator for majority of data",
    340       "detail": "After calibrating on 80 snippets (kappa=0.94), the first author evaluated the remaining 320 snippets alone. While the high kappa mitigates this somewhat, single-rater evaluation introduces potential bias."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "Large language models are few-shot summarizers: Multi-intent comment generation via in-context learning",
    346       "authors": ["Mingyang Geng", "Shangwen Wang", "Dezun Dong", "Haotian Wang", "Ge Li", "Zhi Jin", "Xiaoguang Mao", "Xiangke Liao"],
    347       "year": 2024,
    348       "relevance": "Directly relevant as an LLM-based code comment generation study using in-context learning."
    349     },
    350     {
    351       "title": "Automatic smart contract comment generation via large language models and in-context learning",
    352       "authors": ["Junjie Zhao", "Xiang Chen", "Guang Yang", "Yiheng Shen"],
    353       "year": 2024,
    354       "relevance": "LLM-based automated documentation generation for smart contracts, related to code comprehension."
    355     },
    356     {
    357       "title": "Prompt engineering or fine tuning: An empirical assessment of large language models in automated software engineering tasks",
    358       "authors": ["Jiho Shin", "Clark Tang", "Tahmineh Mohati", "Maleknaz Nayebi", "Song Wang", "Hadi Hemmati"],
    359       "year": 2023,
    360       "arxiv_id": "2310.10508",
    361       "relevance": "Empirical comparison of prompt engineering vs fine-tuning for LLMs in SE tasks, relevant to methodology of using LLMs."
    362     },
    363     {
    364       "title": "CoditT5: Pretraining for source code and natural language editing",
    365       "authors": ["Jiyang Zhang", "Sheena Panthaplackel", "Pengyu Nie", "Junyi Jessy Li", "Milos Gligoric"],
    366       "year": 2022,
    367       "relevance": "Pre-trained model for source code and natural language editing tasks."
    368     },
    369     {
    370       "title": "Correlating automated and human evaluation of code documentation generation quality",
    371       "authors": ["Xing Hu", "Qiuyuan Chen", "Haoye Wang", "Xin Xia", "David Lo", "Thomas Zimmermann"],
    372       "year": 2022,
    373       "doi": "10.1145/3502853",
    374       "relevance": "Evaluates correlation between automated metrics and human assessment for code documentation, relevant to evaluation methodology."
    375     },
    376     {
    377       "title": "Out of the BLEU: how should we assess quality of the code generation models?",
    378       "authors": ["Mikhail Evtikhiev", "Egor Bogomolov", "Yaroslav Sokolov", "Timofey Bryksin"],
    379       "year": 2023,
    380       "relevance": "Critiques automated metrics for code generation quality, motivating human evaluation approaches."
    381     },
    382     {
    383       "title": "Practitioners' expectations on automated code comment generation",
    384       "authors": ["Xing Hu", "Xin Xia", "David Lo", "Zhiyuan Wan", "Qiuyuan Chen", "Thomas Zimmermann"],
    385       "year": 2022,
    386       "relevance": "User study on developer expectations for automated code commenting tools."
    387     },
    388     {
    389       "title": "DocChecker: Bootstrapping code large language model for detecting and resolving code-comment inconsistencies",
    390       "authors": ["Anh T. V. Dau", "Jin L. C. Guo", "Nghi D. Q. Bui"],
    391       "year": 2024,
    392       "relevance": "LLM-based tool for code-comment consistency, related to automated code documentation quality."
    393     },
    394     {
    395       "title": "Automated test case generation using code models and domain adaptation",
    396       "authors": ["Sepehr Hashtroudi", "Jiho Shin", "Hadi Hemmati", "Song Wang"],
    397       "year": 2023,
    398       "arxiv_id": "2308.08033",
    399       "relevance": "LLM-based automated test generation, relevant to evaluation of LLM capabilities in SE tasks."
    400     },
    401     {
    402       "title": "Enriching source code with contextual data for code completion models: An empirical study",
    403       "authors": ["Tim van Dam", "Maliheh Izadi", "Arie van Deursen"],
    404       "year": 2023,
    405       "doi": "10.1109/MSR59073.2023.00035",
    406       "relevance": "Studies the effect of contextual enrichment on code completion models, methodologically parallel to AUTOGENICS's context-awareness approach."
    407     }
    408   ]
    409 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs