scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21784B)
      1 {
      2   "paper": {
      3     "title": "A Comparative Review of AI Techniques for Automated Code Generation in Software Development: Advancements, Challenges, and Future Directions",
      4     "authors": ["Ayman Odeh", "Nada Odeh", "Abdul Salam Mohammed"],
      5     "year": 2024,
      6     "venue": "TEM Journal",
      7     "doi": "10.18421/TEM131-76"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No code, analysis scripts, or repository URLs are provided anywhere in the paper. A survey can release analysis code or data extraction scripts, but this one does not."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset or corpus of reviewed papers is released. The paper does not provide a downloadable list of reviewed studies, search queries, or extracted data tables."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a narrative literature review with no computational experiments, so environment specifications are structurally inapplicable."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No instructions are provided for reproducing the literature search, selection, or comparative analysis. The methodology section (Section 5.1) describes the general approach at a high level but gives no reproducible protocol."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "This is a narrative review paper that does not conduct quantitative experiments or meta-analysis, so confidence intervals are structurally inapplicable."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No statistical comparisons are performed. The paper is a qualitative review, not a quantitative meta-analysis."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No effect sizes are applicable since the paper does not perform quantitative synthesis of results across studies."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No sample size or statistical analysis is involved. This is a narrative literature review."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No experimental runs or quantitative aggregation is performed, so variance reporting is inapplicable."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper does not compare itself against prior surveys or reviews on the same topic. While it reviews AI techniques, it does not position its own review methodology against prior review approaches."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper's references are notably dated. Most cited works are from 2017-2022, with very few from 2023. Key contemporary systems like GPT-4, CodeLlama, StarCoder, and other state-of-the-art code generation models available before the submission date (October 2023) are absent."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "As a review paper with no system or method of its own, ablation studies are structurally inapplicable."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "The paper does not run any experiments, so evaluation metrics for its own methodology are not applicable."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Human evaluation is not applicable to a narrative literature review that makes no system-output claims."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No experiments are performed, so held-out test sets are inapplicable."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 6 provides a per-technique breakdown of strengths, weaknesses, and performance for template-based (TB), rule-based (RB), deep learning (DL), evolutionary algorithm (EA), and NLP approaches. Table 5 lists evaluation criteria. Figure 2 shows frequency of criteria used."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4 discusses 14 specific challenges and limitations of AI-based code generation, including lack of contextual understanding, overfitting, scalability issues, and difficulty handling ambiguities."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports weaknesses for each technique in Table 6 (e.g., DL 'demands large amounts of high-quality training data', EAs have 'high computational overhead', NLP 'not as effective for generating code for complex tasks')."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims to provide a 'comprehensive review and discussion of traditional and AI techniques used for ACG, their challenges, and limitations' and to perform a 'comparative result for AI methods.' The paper does deliver sections on each of these (Sections 2-6, Table 6)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper does not make causal claims. It describes and categorizes AI techniques without claiming that any intervention causes a specific outcome."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes broad claims about AI techniques for code generation without bounding the scope. The title claims to cover 'AI Techniques for Automated Code Generation' generally, but the coverage is selective and heavily skewed toward older methods. Major contemporary systems (GPT-4, CodeLlama, StarCoder) are entirely absent despite being available before submission."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "This is a narrative literature review that presents no empirical results of its own. Alternative explanations are not applicable since there are no findings to explain."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "The paper does not use or evaluate any AI models directly. It is a literature review, so model version specification is inapplicable."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used in this paper. It is a literature review."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No experiments are conducted, so hyperparameter reporting is inapplicable."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a literature review."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The methodology section (5.1) says 'we carefully gathered data from the research articles mentioned in the reference section' but provides no search terms, databases queried, inclusion/exclusion criteria, date ranges, or filtering pipeline with counts. The paper selection process is entirely opaque."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section. The paper discusses limitations of the reviewed AI techniques (Section 4) but never discusses limitations of its own review methodology."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity of the review itself are discussed. There is no acknowledgment of selection bias, coverage gaps, or methodological limitations of the survey approach."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit scope boundaries are stated. The paper does not specify what is excluded from the review, what time period it covers, or what types of code generation it does not address."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (e.g., the corpus of reviewed papers, extracted data points, search results) is made available for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "Section 5.1 states 'we carefully gathered data from the research articles mentioned in the reference section' but does not describe how these articles were found, what databases were searched, what search terms were used, or what time period was covered."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The paper reviews published literature, not human subjects."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No data pipeline is documented. There is no description of how papers were identified, screened, selected, or how data was extracted from them. A PRISMA-style flow diagram or equivalent is absent."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: Al Ain University (UAE) and Skyline University College (Dubai, UAE)."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding statement is itself a disclosure gap."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This is a literature review that does not evaluate any pre-trained model on a benchmark. Contamination concerns are structurally inapplicable."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No model evaluation is performed. Contamination concerns are inapplicable to a narrative literature review."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmarks are run. This is a survey paper."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this literature review."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a survey paper. Cost reporting for its own method is inapplicable."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a survey paper with no computational experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "AI techniques for automated code generation have shown significant advancements in generating code, improving efficiency, and enhancing code quality.",
    286       "evidence": "Conclusion (Section 7) and discussion throughout Sections 2-3. No quantitative evidence is provided; the claim rests on summarizing individual papers' reported results.",
    287       "supported": "weak"
    288     },
    289     {
    290       "claim": "Deep learning techniques excel in capturing complex patterns and generating code with improved accuracy compared to traditional methods.",
    291       "evidence": "Section 2.5 and Table 6 comparative analysis. The claim is based on qualitative descriptions of DL strengths from cited papers, not a systematic quantitative comparison conducted by the authors.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "NLP-based code generation can be effective for simple tasks but is not as effective for complex tasks.",
    296       "evidence": "Table 6 lists this as a performance characteristic of NLP. No systematic evidence or quantitative comparison is provided to support this specific boundary between simple and complex tasks.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Evolutionary algorithms are effective for optimization-based code generation tasks.",
    301       "evidence": "Table 6 and Section 2.6 describe EA strengths. The evidence is drawn from cited papers [37], [48], [49] but no systematic comparison is performed by the authors.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["meta-analysis"],
    306   "key_findings": "This paper provides a narrative review of AI techniques for automated code generation (ACG), categorizing approaches into rule-based, template-based, machine learning, deep learning, NLP, and evolutionary algorithms. The comparative analysis (Table 6) identifies qualitative strengths and weaknesses of each approach category. The paper catalogs 13 evaluation criteria commonly used in the reviewed literature and proposes 17 future research directions. However, the review methodology is not systematic and the coverage of contemporary work (post-2022 LLMs) is notably absent.",
    307   "red_flags": [
    308     {
    309       "flag": "No systematic review methodology",
    310       "detail": "The paper claims to be a 'comparative review' but does not follow any systematic review protocol (e.g., PRISMA). Section 5.1 describes the methodology as 'carefully gathered data from the research articles mentioned in the reference section' without specifying search terms, databases, inclusion/exclusion criteria, or a filtering pipeline. The paper selection process is entirely opaque."
    311     },
    312     {
    313       "flag": "Major coverage gaps in contemporary work",
    314       "detail": "Despite being published in February 2024 (submitted October 2023), the paper almost entirely omits the LLM revolution in code generation. GPT-4, CodeLlama, StarCoder, WizardCoder, CodeGen, and other major models available before October 2023 are not discussed. GitHub Copilot is mentioned only in passing. The most recent substantive references are from 2022-2023 but cover older topics."
    315     },
    316     {
    317       "flag": "No quality assessment of reviewed studies",
    318       "detail": "The survey summarizes claims from reviewed papers without any quality assessment of those studies. It does not evaluate the methodological rigor of the papers it synthesizes, so weakly supported claims from its sources are passed along with the same weight as well-supported ones."
    319     },
    320     {
    321       "flag": "Qualitative comparisons presented as findings",
    322       "detail": "Table 6 presents a comparative analysis of AI techniques, but the comparisons are entirely qualitative (e.g., 'High accuracy in generating contextually rich code'). No quantitative metrics, benchmark results, or systematic evidence is aggregated across studies."
    323     },
    324     {
    325       "flag": "No limitations section for the review itself",
    326       "detail": "While Section 4 discusses limitations of AI-based code generation techniques, the paper never discusses limitations of its own review methodology, including potential selection bias, coverage gaps, or the absence of systematic search procedures."
    327     },
    328     {
    329       "flag": "Duplicate content in challenge listing",
    330       "detail": "Section 4 lists 'Lack of Sufficient Training Data' as challenge #1 and 'Limited Training Data' as challenge #3, which are essentially the same issue, suggesting superficial organization."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "Code generation using machine learning: A systematic review",
    336       "authors": ["E. Dehaerne", "B. Dey", "S. Halder", "S. De Gendt", "W. Meert"],
    337       "year": 2022,
    338       "doi": "10.1109/ACCESS.2022.3196347",
    339       "relevance": "Systematic review of ML for code generation with structured methodology, directly comparable to the survey scope."
    340     },
    341     {
    342       "title": "Deep learning for source code modeling and generation: Models, applications, and challenges",
    343       "authors": ["T. H. Le", "H. Chen", "M. A. Babar"],
    344       "year": 2020,
    345       "doi": "10.1145/3383458",
    346       "relevance": "Survey on deep learning approaches to source code modeling, relevant to understanding the methodological landscape of code generation research."
    347     },
    348     {
    349       "title": "A survey of machine learning for big code and naturalness",
    350       "authors": ["M. Allamanis", "E. T. Barr", "P. Devanbu", "C. Sutton"],
    351       "year": 2018,
    352       "doi": "10.1145/3212695",
    353       "relevance": "Foundational survey on ML for code, widely cited and relevant as a baseline for comparing subsequent survey quality."
    354     },
    355     {
    356       "title": "Competition-level code generation with AlphaCode",
    357       "authors": ["Y. Li", "D. Choi", "J. Chung", "N. Kushman", "J. Schrittwieser", "R. Leblond", "O. Vinyals"],
    358       "year": 2022,
    359       "doi": "10.1126/SCIENCE.ABQ1158",
    360       "relevance": "Major benchmark evaluation paper showing AI achieving competitive programming performance, directly relevant to code generation capability assessment."
    361     },
    362     {
    363       "title": "Deep Learning Based Code Generation Methods: A Literature Review",
    364       "authors": ["Z. Yang", "S. Chen", "C. Gao", "Z. Li", "G. Li", "R. Lv"],
    365       "year": 2023,
    366       "arxiv_id": "2303.01056",
    367       "relevance": "Recent literature review on DL-based code generation, directly relevant for comparing review methodologies and coverage."
    368     },
    369     {
    370       "title": "Assessing the quality of GitHub Copilot's code generation",
    371       "authors": ["B. Yetistiren", "I. Ozsoy", "E. Tuzun"],
    372       "year": 2022,
    373       "doi": "10.1145/3558489.3559072",
    374       "relevance": "Empirical evaluation of AI code generation quality from GitHub Copilot, relevant to understanding evaluation methodology in the field."
    375     },
    376     {
    377       "title": "A survey on machine learning techniques for source code analysis",
    378       "authors": ["T. Sharma", "M. Kechagia", "S. Georgiou", "R. Tiwari", "I. Vats", "H. Moazen", "F. Sarro"],
    379       "year": 2021,
    380       "relevance": "Survey covering ML techniques for source code, relevant for comparing survey methodological quality."
    381     },
    382     {
    383       "title": "Programming is hard - or at least it used to be: Educational opportunities and challenges of AI code generation",
    384       "authors": ["B. A. Becker", "P. Denny", "J. Finnie-Ansley", "A. Luxton-Reilly", "J. Prather", "E. A. Santos"],
    385       "year": 2023,
    386       "doi": "10.1145/3545945.3569759",
    387       "relevance": "Discusses educational implications of AI code generation tools, relevant to the broader impact assessment in the survey scope."
    388     },
    389     {
    390       "title": "A comprehensive survey on program synthesis with evolutionary algorithms",
    391       "authors": ["D. Sobania", "D. Schweim", "F. Rothlauf"],
    392       "year": 2022,
    393       "doi": "10.1109/TEVC.2022.3162324",
    394       "relevance": "Comprehensive survey on evolutionary approaches to program synthesis, relevant for comparing survey quality and coverage."
    395     },
    396     {
    397       "title": "Improving ChatGPT Prompt for Code Generation",
    398       "authors": ["C. Liu", "X. Bao", "H. Zhang", "N. Zhang", "H. Hu", "X. Zhang", "M. Yan"],
    399       "year": 2023,
    400       "arxiv_id": "2305.08360",
    401       "relevance": "Addresses prompt engineering for LLM-based code generation, relevant to the survey scope on AI techniques for code generation."
    402     }
    403   ]
    404 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs