ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (16859B)


      1 {
      2   "paper": {
      3     "title": "AI Code Review Assistant: A Modern Web Based Solution for Automated Code Analysis and Developer Productivity Enhancement",
      4     "authors": ["Mohanakshi KM", "Sandeep"],
      5     "year": 2025,
      6     "venue": "International Journal for Research in Applied Science & Engineering Technology (IJRASET)",
      7     "doi": "10.22214/ijraset.2025.73682"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset or evaluation data is released. The 75-user beta test data is not made available."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Technologies are named (Next.js, Firebase, Groq API, Tailwind CSS) but no versions, dependency manifest (e.g., package.json), or environment setup details are provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions, README, or steps to recreate the system or experiments are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results in Table II are point estimates only (e.g., '92.3%', '4.2/5.0') with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims '34% better accuracy' compared to static analysis tools but provides no statistical test for this comparison."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Raw percentages are reported but without baseline context or formal effect size measures. The '34% better accuracy' claim lacks detail on how this was measured."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "75 beta users are mentioned but no justification for this sample size is given, nor any power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or spread measures are reported for any metric."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper claims '34% better accuracy' vs traditional static analysis tools but provides no specific baseline system, no named tool, and no methodology for the comparison."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No specific baseline systems are identified. The comparison to 'traditional static analysis tools' is vague and unsubstantiated."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The system has multiple components (AI integration, thread management, analytics) but no ablation study examines their individual contributions."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table II reports multiple metrics: response time, code analysis accuracy, user satisfaction, system availability, and mobile responsiveness."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "75 beta users provided satisfaction ratings (4.2/5.0 overall) and the paper reports user retention rate (85%). This constitutes human evaluation of the system's outputs."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No mention of how the '92.3% code analysis accuracy' was measured — no test set, no ground truth, no evaluation methodology described."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No per-language, per-issue-type, or per-category breakdown of the accuracy or satisfaction results is provided."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No failure cases, error analysis, or examples of incorrect code reviews are discussed."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Every result reported is positive. No failed approaches, unsuccessful configurations, or negative findings are mentioned."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The abstract claims '92% accuracy in code issue detection' but the paper never describes how accuracy was measured, what ground truth was used, or what constitutes a 'code issue.' The claim is unverifiable from the paper's content."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper claims the system 'enhance[s] code quality, reduce[s] review time, and improve[s] overall developer productivity' but provides no controlled study or causal design to support these causal claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes broad claims about 'developer productivity enhancement' and 'modern software development environments' based on a 75-user beta test with no description of the user population or programming contexts tested."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are considered for the results. Novelty effects, selection bias in beta users, or Hawthorne effects are not discussed."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper says 'Groq API' and 'Groq's large language models' but never specifies which model (e.g., Llama-2-70b, Mixtral) or version was used."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper mentions 'specialized prompt engineering for different code review scenarios' but provides no actual prompt text. Only a code snippet showing the API call structure is given."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters (temperature, top-p, max tokens) for the Groq API calls are reported."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper mentions 'context-aware prompt engineering, response parsing, and intelligent fallback mechanisms' but provides no detail on what these actually do or how they work."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No description of how code submissions were processed before being sent to the AI model, or how evaluation data was collected and prepared."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section VI.D 'Limitations and Future Work' exists and mentions dependency on external AI API availability and need for model fine-tuning."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations are generic ('dependency on external AI API availability', 'need for continuous model fine-tuning'). No specific threats to the validity of the evaluation results are discussed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statement of what the results do not show or what settings/populations are excluded. Claims are presented broadly without bounding."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data from the beta test, user surveys, or accuracy evaluation is made available."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The paper states '6-week testing period with 75 beta users' but does not describe how data was collected, what instruments were used, or what was measured and how."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No description of how the 75 beta users were recruited, from what population, or whether this introduces selection bias."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No documentation of how raw usage data or survey responses were transformed into the reported metrics."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The acknowledgment section thanks faculty and university but does not explicitly state funding sources or lack thereof."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Authors are listed as affiliated with 'MCA, Navkis College of Engineering, Visvesvaraya Technological University.'"
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "This appears to be an unfunded university student project (MCA degree). No external funder with stake in outcomes."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It builds a tool using the Groq API and evaluates the tool's usability/accuracy in a deployed setting."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable — no benchmark evaluation of a pre-trained model's knowledge."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable — no benchmark evaluation of a pre-trained model's knowledge."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No pre-registration mentioned. The 75-user beta test constitutes a human subjects study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No IRB or ethics board approval is mentioned despite collecting data from 75 human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "Participants are described only as '75 beta users from diverse programming backgrounds.' No experience levels, roles, or other demographics are reported."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No criteria for who was eligible to participate in the beta test are stated."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "This is not an experimental study with treatment/control conditions. All users used the same system."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "Not an experimental study with conditions requiring blinding."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "An 85% retention rate is mentioned, but no detail is given on how many users started vs. finished, or on reasons for dropout."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The system calls the Groq API for every code review but no API costs, tokens consumed, or cost per review are reported."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No computational budget, hosting costs, or resource requirements are stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The system achieves 92.3% accuracy in code issue detection.",
    286       "evidence": "Table II reports 92.3% code analysis accuracy against a benchmark of >85%. No methodology for measuring accuracy is described — no ground truth, test set, or evaluation protocol.",
    287       "supported": "unsupported"
    288     },
    289     {
    290       "claim": "The AI Code Review Assistant shows 34% better accuracy in identifying critical code issues compared to traditional static analysis tools.",
    291       "evidence": "Section V.C states this comparison but names no specific tools, describes no methodology, and provides no data.",
    292       "supported": "unsupported"
    293     },
    294     {
    295       "claim": "User satisfaction score is 4.2/5.0 based on 75 beta users.",
    296       "evidence": "Table II and Section V.B report this figure from a 6-week beta test. No survey instrument, collection methodology, or raw data is provided.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "The system achieves 85% user retention rate after initial trial period.",
    301       "evidence": "Section V.B mentions this metric but provides no definition of retention, no timeframe specifics, and no methodology.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["case-study"],
    306   "key_findings": "The paper describes a web-based AI code review tool built with Next.js, Firebase, and Groq API. It reports 92.3% code analysis accuracy and 4.2/5.0 user satisfaction from 75 beta testers over 6 weeks. However, no evaluation methodology is described for the accuracy claim, no baselines are properly compared, and no raw data or instruments are provided. The paper is essentially a system description with unverifiable performance claims.",
    307   "red_flags": [
    308     {
    309       "flag": "Unverifiable accuracy claims",
    310       "detail": "The paper claims 92.3% code analysis accuracy but never describes what ground truth was used, how accuracy was measured, or what constitutes a 'code issue.' The number appears without any supporting methodology."
    311     },
    312     {
    313       "flag": "Phantom baselines",
    314       "detail": "The claim of '34% better accuracy compared to traditional static analysis tools' names no specific tools, describes no comparison methodology, and provides no data to support it."
    315     },
    316     {
    317       "flag": "No negative results",
    318       "detail": "Every metric reported exceeds its benchmark. No failures, limitations in accuracy, or negative user feedback is discussed."
    319     },
    320     {
    321       "flag": "Suspiciously clean results",
    322       "detail": "All metrics conveniently exceed their stated benchmarks (Table II). No variance, uncertainty, or per-category breakdown is provided."
    323     },
    324     {
    325       "flag": "Undisclosed AI model",
    326       "detail": "The paper uses 'Groq API' but never identifies which underlying LLM is being called, making the work impossible to reproduce."
    327     }
    328   ],
    329   "cited_papers": [
    330     {
    331       "title": "Expectations, outcomes, and challenges of modern code review",
    332       "authors": ["A. Bacchelli", "C. Bird"],
    333       "year": 2013,
    334       "relevance": "Foundational paper on code review practices that informs the motivation for AI-assisted code review tools."
    335     },
    336     {
    337       "title": "Evaluating large language models trained on code",
    338       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    339       "year": 2021,
    340       "arxiv_id": "2107.03374",
    341       "relevance": "Codex/HumanEval paper, foundational for LLM-based code generation and analysis evaluation."
    342     },
    343     {
    344       "title": "Language models are few-shot learners",
    345       "authors": ["T. Brown", "B. Mann", "N. Ryder"],
    346       "year": 2020,
    347       "relevance": "GPT-3 paper establishing capabilities of large language models for few-shot tasks including code."
    348     }
    349   ]
    350 }

Impressum · Datenschutz