scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17802B)
      1 {
      2   "paper": {
      3     "title": "AI-Powered Code Review Enhancing Software Quality with Intelligent Agents",
      4     "authors": ["Ravikanth Konda"],
      5     "year": 2023,
      6     "venue": "International Journal of Leading Research Publication (IJLRP)"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No source code, repository URL, or archive link is provided anywhere in the paper."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "The paper mentions assembling a benchmark dataset from GitHub and using Defects4J/Bugs.jar but provides no download link or release of the assembled dataset."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No environment specifications, dependency lists, or software versions are provided."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No reproduction instructions, scripts, or step-by-step procedures are included."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Results are reported as point estimates (e.g., 'precision rates of over 85%', 'recall rates ranged between 78% and 82%') with no confidence intervals or error bars."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper makes comparative claims about tool performance but provides no statistical significance tests."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No effect sizes are reported. Numbers like '35% decrease' and '28% decreased' are given without baseline context or formal effect size measures."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The survey mentions 20 developers but provides no justification for this sample size. The benchmark dataset size is never specified."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No variance, standard deviation, or spread measures are reported for any results."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Multiple AI tools are compared against each other but there is no baseline comparison (e.g., manual-only review, rule-based-only static analysis). No prior work is used as a quantitative baseline."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No formal baselines are included, so contemporaneity cannot be assessed."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No ablation study is conducted on any of the tools or approaches."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper reports precision, recall, F1 score, latency, and reviewer agreement as metrics (Section III-IV)."
     78       },
     79       "human_evaluation": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "20 developers provided feedback on the AI suggestions via structured interviews and surveys (Section III). 72% rated suggestions as 'useful' or 'very useful' (Section IV)."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No mention of train/test splits or held-out evaluation sets. The experimental design is described too vaguely to determine data separation."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section IV provides per-tool performance breakdowns and notes that different tools performed better on different defect types (semantic bugs vs. stylistic issues)."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section V discusses limitations including inability to understand domain-specific logic, business rules, and architectural intent. Developers noted pedantic suggestions and missed architectural concerns."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Every experiment shows positive results. No failed approaches, configurations that didn't work, or negative findings are reported."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "The abstract claims 'a detailed examination' and results from 'experimental assessment,' but the results section provides only vague aggregate numbers without tables, figures, or detailed data to support the claims."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The paper claims AI tools caused '35% decrease' in turnaround time and '28%' decrease in error density, but the study design (no controlled comparison, no randomization of teams) is inadequate for causal inference."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The title and abstract claim broad applicability ('enhancing software quality') but the study tests only a few tools on unspecified datasets. No explicit bounding of generalization claims."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "No alternative explanations are discussed for the observed improvements. Confounds like novelty effect, Hawthorne effect, or selection bias in the developer sample are not considered."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "Tools are named (ReviewBot, DeepCode, Codex, CodeBERT, DeepBugs) but no specific versions, model sizes, or snapshot dates are provided."
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper uses LLM-based tools (Codex, CodeBERT) but provides no prompts or input specifications."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No hyperparameters (temperature, sampling settings, learning rates) are reported for any tool."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "The paper evaluates existing third-party tools as black boxes; authors cannot be expected to describe internal scaffolding."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper vaguely mentions assembling a dataset from GitHub repositories 'sampled in a way that represents a balance between clean code and buggy code' but provides no specific filtering criteria, dataset size, or preprocessing steps."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "There is no dedicated limitations or threats-to-validity section. Some limitations are mentioned in the Discussion (Section V) but not in a structured subsection."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "The discussion mentions generic limitations (contextual understanding, trust, scalability) but these are about AI tools generally, not specific threats to this study's validity."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No explicit statements about what the results do not show or what settings/populations are excluded."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No raw data is available. Results are presented only as vague aggregate numbers in prose (no tables or figures with data)."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "Data collection is described only vaguely: 'code snippets, pull requests, and their corresponding review comments from open-source repositories on GitHub.' No specific repositories, time period, or inclusion criteria are given."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper mentions 20 developers 'working in different companies and open-source platforms' but does not describe how they were recruited or selected."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No data pipeline is documented. There are unexplained jumps from 'we assembled a benchmark dataset' to aggregate results with no intermediate steps."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding source or acknowledgments section is present in the paper."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The author is identified as 'Senior Software Developer' with a Gmail address. No institutional affiliation is hidden."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not absence of conflict."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial interest statement is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "The paper uses pre-trained models (Codex, CodeBERT) on benchmark data but does not state any training data cutoff dates."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No discussion of whether the benchmark code snippets could have appeared in the training data of the LLMs used."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "Defects4J and Bugs.jar are well-known public benchmarks that predate the models used, so they may have appeared in the models' training data. The paper does not discuss this contamination risk."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No pre-registration is mentioned for the developer survey/interview study."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No IRB or ethics board approval is mentioned despite collecting data from 20 human participants."
    242       },
    243       "demographics_reported": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "The 20 developers are described only as 'working in different companies and open-source platforms.' No experience levels, demographics, or other characterization is provided."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No inclusion or exclusion criteria for participant selection are stated."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "This is not a randomized experimental study comparing treatment conditions for participants; it is a feedback/survey study."
    257       },
    258       "blinding_described": {
    259         "applies": true,
    260         "answer": true,
    261         "justification": "Section III states 'blinded some of the participants to whether a suggestion was AI- or human-generated.'"
    262       },
    263       "attrition_reported": {
    264         "applies": true,
    265         "answer": false,
    266         "justification": "No information on whether all 20 developers completed the study or if any dropped out."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "Latency is mentioned ('less than three seconds') but no API costs, token counts, or cost per review are reported."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No computational budget, hardware specifications, or total compute used is reported."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "AI tools achieve precision rates of over 85% and recall rates between 78-82% for defect detection.",
    285       "evidence": "Section IV states these numbers but provides no tables, no dataset sizes, no per-tool breakdown of precision/recall, and no statistical tests.",
    286       "supported": "weak"
    287     },
    288     {
    289       "claim": "ReviewBot achieved an F1 score of 0.84 for defect detection.",
    290       "evidence": "Stated in Section IV prose without supporting data tables or methodology details.",
    291       "supported": "weak"
    292     },
    293     {
    294       "claim": "AI-powered reviews reduced review turnaround time by 35%.",
    295       "evidence": "Stated in Section IV without specifying baseline conditions, team sizes, project types, or statistical analysis.",
    296       "supported": "unsupported"
    297     },
    298     {
    299       "claim": "Error density for merged pull requests decreased by about 28% after integrating AI tools.",
    300       "evidence": "Stated in Section IV without specifying measurement methodology, time period, or controlling for confounds.",
    301       "supported": "unsupported"
    302     },
    303     {
    304       "claim": "In 72% of instances, AI suggestions were rated as 'useful' or 'very useful' by developers.",
    305       "evidence": "Section IV, from 20 developer interviews/surveys. No breakdown of rating scale, no inter-rater reliability.",
    306       "supported": "weak"
    307     }
    308   ],
    309   "methodology_tags": ["benchmark-eval", "qualitative"],
    310   "key_findings": "The paper claims AI-powered code review tools (ReviewBot, DeepCode, Codex, CodeBERT, DeepBugs) achieve high precision (>85%) and recall (78-82%) for defect detection, reduce review turnaround time by 35%, and decrease error density by 28%. Developer feedback from 20 participants was generally positive, with 72% rating AI suggestions as useful. However, all results are presented as vague prose assertions without data tables, statistical analysis, or reproducible methodology.",
    311   "red_flags": [
    312     {
    313       "flag": "Predatory journal",
    314       "detail": "Published in IJLRP (International Journal of Leading Research Publication), which shows hallmarks of a predatory journal: generic name, Gmail-based author contact, no institutional affiliation, minimal peer review evidence."
    315     },
    316     {
    317       "flag": "No data tables or figures with results",
    318       "detail": "All quantitative results are stated only in prose paragraphs. No data tables, no result figures, no raw numbers. This makes verification impossible."
    319     },
    320     {
    321       "flag": "Suspiciously clean results",
    322       "detail": "All tools perform well, all metrics are positive, no negative results are reported. Every number supports the narrative without exception."
    323     },
    324     {
    325       "flag": "Vague methodology with specific-sounding numbers",
    326       "detail": "The methodology section is too vague to reproduce (no dataset sizes, no tool versions, no hyperparameters), yet the results section presents precise numbers (85%, 0.84, 35%, 28%) without supporting evidence."
    327     },
    328     {
    329       "flag": "Unverifiable references",
    330       "detail": "Several references (e.g., ReviewBot [6], StatixAI [5]) do not appear to correspond to real published papers. The arXiv IDs and publication details may be fabricated."
    331     },
    332     {
    333       "flag": "No statistical rigor",
    334       "detail": "No confidence intervals, significance tests, effect sizes, variance measures, or sample size justification for any claim."
    335     }
    336   ],
    337   "cited_papers": [
    338     {
    339       "title": "AUGER: Automatically Generating Review Comments with Pre-trained Language Models",
    340       "authors": ["L. Li", "A. Chen", "M. Zhou", "H. Xu"],
    341       "year": 2022,
    342       "arxiv_id": "2208.08014",
    343       "relevance": "Directly addresses AI-generated code review comments using pre-trained language models."
    344     },
    345     {
    346       "title": "DeepBugs: A Learning Approach to Name-Based Bug Detection",
    347       "authors": ["M. Pradel", "K. Sen"],
    348       "year": 2018,
    349       "relevance": "Foundational work on deep-learning-based bug detection relevant to AI code quality tools."
    350     },
    351     {
    352       "title": "A Survey of Machine Learning for Big Code and Naturalness",
    353       "authors": ["M. Allamanis", "E. T. Barr", "P. Devanbu", "C. Sutton"],
    354       "year": 2019,
    355       "relevance": "Comprehensive survey of ML applied to source code, foundational reference for the field."
    356     },
    357     {
    358       "title": "CodeBERT: A Pre-trained Model for Programming and Natural Languages",
    359       "authors": ["S. Wang", "A. Tiwari", "T. White"],
    360       "year": 2021,
    361       "relevance": "Pre-trained model for code understanding used in code review and generation tasks."
    362     }
    363   ]
    364 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs