ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25725B)


      1 {
      2   "paper": {
      3     "title": "Rethinking Code Review Workflows with LLM Assistance: An Empirical Study",
      4     "authors": [
      5       "Fannar Steinn Aðalsteinsson",
      6       "Björn Borgar Magnússon",
      7       "Mislav Milicevic",
      8       "Adam Nirving Davidsson",
      9       "Chih-Hong Cheng"
     10     ],
     11     "year": 2025,
     12     "venue": "International Symposium on Empirical Software Engineering and Measurement (ESEM)",
     13     "arxiv_id": "2505.16339",
     14     "doi": "10.1109/ESEM64174.2025.00013"
     15   },
     16   "scan_version": 3,
     17   "active_modules": [],
     18   "methodology_tags": ["qualitative", "case-study"],
     19   "key_findings": "This qualitative field study at WirelessCar finds that developers generally prefer AI-led code review assistance (Mode A, co-reviewer with upfront summaries) over on-demand interaction (Mode B), particularly for large or unfamiliar pull requests. Key challenges in traditional reviews include context switching, delayed reviews, and insufficient contextual information, which RAG-enabled AI tools can partially address. However, trust, false positives, response latency, and tool integration into existing environments remain barriers to adoption.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Source code for the artifact is released under GPLv3 on GitHub: frontend (https://github.com/BearPays/code-review-assistant-ui) and backend (https://github.com/BearPays/code-review-assistant-back), as stated in Section IV.B.3."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "Interview transcripts, observation notes, and thematic analysis codebooks are not released. The paper reports only aggregated qualitative themes."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided. The paper mentions LlamaIndex and OpenAI API but does not specify library versions or dependencies needed to replicate the tool."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided for either the qualitative study methodology or the software artifact setup."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "This is a qualitative study using thematic analysis of interview data. No quantitative metrics are reported that would require confidence intervals or error bars."
     48       },
     49       "significance_tests": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No quantitative comparative claims are made. All findings are derived from qualitative thematic analysis of interviews."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "Purely qualitative study with no quantitative effect measurements."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Phase 1 (7 participants) is justified via data saturation argument citing Guest et al. [15]. Phase 2 (10 participants) has no sample size justification — the number appears to be determined by availability rather than any principled criterion."
     63       },
     64       "variance_reported": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "No quantitative experimental results to report variance on. All results are qualitative themes from interviews."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The study compares two AI interaction modes (Mode A: co-reviewer, Mode B: interactive assistant) against each other and implicitly against traditional manual code review practices documented in Phase 1."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The baseline is current code review practice at WirelessCar, documented through Phase 1 interviews. This is the most relevant contemporary baseline for a qualitative study of developer experience."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The RAG pipeline has multiple components (search_pr, search_code, search_requirements, start_review sub-agent) but no ablation study examines the contribution of individual components."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No formal quantitative evaluation metrics are used. Evaluation is entirely qualitative via thematic analysis of interviews, without structured rating scales, Likert items, or other formal instruments."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The entire Phase 2 evaluation is human-based: 10 developers used both tool modes on real PRs and provided feedback through post-experiment semi-structured interviews (Section IV.B)."
     95       },
     96       "held_out_test_set": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "Not applicable — this is a qualitative field study, not a benchmark evaluation requiring train/test splits."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by theme (4 themes in Phase 2: accuracy/trust, efficiency/thoroughness, design expectations, usage contexts) and by Mode A vs Mode B preferences. Differences between participants who were familiar and those who were unfamiliar with the code under review are also discussed."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Multiple failure cases are discussed: incorrect import flagging, unclear suggestions ('Sometimes it says some slightly strange things' — P8), difficulty distinguishing important from minor findings, and cases where the tool did not meet expectations (Section V.B.1, V.B.3)."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports negative findings including false positives, trust concerns, over-reliance risks, long response times, difficulty parsing lengthy AI output, and participants questioning whether the tool was 'doing what it was asked' (Section V.B)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims 'AI-led reviews are overall more preferred, while still being conditional on the reviewers' familiarity with the code base, as well as on the severity of the pull request.' Both claims are supported by participant quotes and thematic analysis in Section V.B.4."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper uses causal language such as 'the assistant reduced the effort involved in reviewing' and 'the AI assistant could speed up the review process' (Section V.B.2), but these claims are based on developer self-reports from a qualitative study without controlled measurement of actual efficiency or effort. The field experiment design does not support causal inference about effectiveness."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "While the methodology sections note the study is at WirelessCar, the title ('Rethinking Code Review Workflows with LLM Assistance') and implications section (Section VI) make broad prescriptive claims about how AI should be integrated into code review workflows generally, beyond what 10 participants at one company support."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "No alternative explanations are discussed for the observed preferences. Novelty effects, demand characteristics (participants wanting to please researchers who were present), the influence of the specific PRs chosen, and selection bias toward AI-enthusiastic volunteers are not considered."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper measures developer preferences and perceptions via interviews but frames findings as evidence for how LLMs can 'improve developer experience and potentially support review efficiency' (Section I). The gap between reported preferences and actual review quality/efficiency improvements is not discussed."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper states 'OpenAI's o4-mini' (Section IV.B.3) with only a link to the general models documentation page. No snapshot date, API version, or specific model ID is provided. Per schema rules, marketing names without a snapshot date do not count."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The Mode A sub-agent uses 'a detailed review-specific prompt' (Section IV.B.3) but the actual prompt text is not provided anywhere in the paper or supplementary materials. Only natural-language descriptions of the prompting approach are given."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for the OpenAI o4-mini API calls."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "The agentic tool structure is described in Section IV.B.3 with a diagram (Fig. 3). Three core semantic tools (search_pr, search_code, search_requirements) and the Mode A sub-agent with start_review are described. The RAG infrastructure using LlamaIndex is documented, including what data sources feed each tool."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper mentions 'The RAG index had to be manually prepared and indexed before experiments' but does not describe how code diffs, source files, and Jira tickets were preprocessed, chunked, or indexed for the RAG pipeline. The thematic analysis coding process is also not detailed."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "There is no dedicated limitations or threats-to-validity section. The concluding remarks (Section VII) mention some concerns briefly but do not provide substantive limitations discussion."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "Threats to validity are not discussed in any specific or systematic way. The paper acknowledges in passing that convenience sampling is 'limited in randomness' (Section IV.A) but does not address construct, internal, or external validity threats."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper does not explicitly state what the results do NOT show. The implications section (VI) makes broad prescriptive claims without bounding them to the single-company, small-sample context of the study."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "Interview transcripts, observation notes, and coded thematic data are not released. Only selected quotes and aggregated themes are presented."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Data collection is well described: semi-structured interviews (15-40 minutes), in-person or via Teams, interview domains specified, pilot testing conducted, observation notes taken during Phase 2 sessions (Section IV.A, IV.B)."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Recruitment via convenience sampling through Slack channel announcements and informal messages is described for both phases. The paper notes they sought reviewers 'at varying seniorities' and from different teams (Section IV.A, IV.B.1)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The thematic analysis process is not documented in detail. The paper cites Braun and Clarke [12] for the method but does not describe coding procedures, inter-rater reliability, how codes were grouped into themes, or how many rounds of coding were performed."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source or acknowledgments section is present in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations with WirelessCar Sweden AB and Chalmers University of Technology / University of Gothenburg are clearly stated in the header. The first two authors are affiliated with both WirelessCar and Chalmers."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Multiple authors are affiliated with WirelessCar, the company where the study was conducted and which would benefit from positive findings about AI-assisted code review. No funding independence statement is provided."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This paper does not evaluate a pre-trained model's capability on any benchmark. It is a qualitative study of developer experience with an AI code review tool."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No benchmark evaluation is performed. The study evaluates developer perceptions, not model performance on test sets."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No benchmark evaluation is performed."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "No pre-registration is mentioned. The study involves 7 interviewees (Phase 1) and 10 experiment participants (Phase 2) but was not pre-registered."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "No IRB or ethics board approval is mentioned anywhere in the paper, despite conducting interviews and experiments with human participants."
    259       },
    260       "demographics_reported": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "Tables I and II list participant IDs, roles, and team assignments. The paper mentions 'The interviewees varied in gender and age' but does not report specific demographics such as years of experience, age distribution, gender breakdown, or programming expertise."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "No formal inclusion/exclusion criteria are stated. Participants were sought who 'handled different parts of the code and were reviewers at varying seniorities' but this is a preference, not a formal criterion. Recruitment was via convenience sampling."
    269       },
    270       "randomization_described": {
    271         "applies": true,
    272         "answer": true,
    273         "justification": "The assignment of interaction modes to PRs was counterbalanced: 'The assignment of modes to PRs was rotated across participants to mitigate ordering effects' (Section IV.B.2). Each participant reviewed both PRs with alternating modes."
    274       },
    275       "blinding_described": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No blinding is described. Participants knew which mode they were using (Mode A provides upfront summaries, Mode B does not), and researchers were present during sessions observing interactions."
    279       },
    280       "attrition_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "For Phase 1, the paper reports that 'an eighth interview had been scheduled, the participant canceled' (Section IV.A). For Phase 2, 5 of 7 Phase 1 participants returned, plus 5 new recruits, reaching 10 total. All 10 completed the experiment."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "The tool uses OpenAI o4-mini API with RAG, but no inference cost, tokens consumed, or latency measurements are reported. Participants complained about response times but no quantitative latency data is given."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No total computational budget, API costs, or hardware specifications are stated."
    296       }
    297     }
    298   },
    299   "claims": [
    300     {
    301       "claim": "AI-led reviews (Mode A) are generally preferred over on-demand interaction (Mode B), especially for large or unfamiliar pull requests.",
    302       "evidence": "Multiple participant quotes in Section V.B.4: P12 preferred Mode A for getting 'the overview directly'; P7 described Mode A as 'quite clever, and I would gladly use that'; P11 preferred Mode A for low-risk changes; P8 saw Mode A as valuable for newcomers.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "Context switching, delayed reviews, and insufficient contextual information are key challenges in traditional code reviews.",
    307       "evidence": "Phase 1 interviews in Section V.A.2: P7 describes 20 minutes lost per context switch; P2 notes large PRs remain unreviewed; multiple interviewees report missing context in PR descriptions.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "RAG-based contextual support (code diffs, source files, Jira tickets) is valued by reviewers for reducing effort and improving understanding.",
    312       "evidence": "Section V.B.2: P3 values the Jira integration ('The first thing I do every time is I open the ticket anyway'). P10 notes the assistant could identify issues that might otherwise go unnoticed.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "Trust, false positives, and response latency are key barriers to AI code review tool adoption.",
    317       "evidence": "Section V.B.1: P8 notes the assistant 'says some slightly strange things'; P3 worries about being 'colored' by AI suggestions; P7 notes trust is required. Section V.B.3: P3 cites speed as a major barrier.",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "LLMs can meaningfully augment, rather than replace, human reviewers in code review.",
    322       "evidence": "Section VII concluding remarks and Section VI implications. Based on qualitative preferences from 10 participants at a single company.",
    323       "supported": "weak"
    324     }
    325   ],
    326   "red_flags": [
    327     {
    328       "flag": "Very small sample size",
    329       "detail": "7 interviewees in Phase 1 and 10 participants in Phase 2, all from a single company (WirelessCar). Phase 2 sample size is not justified. Broad implications are drawn from this narrow base."
    330     },
    331     {
    332       "flag": "Convenience sampling with likely selection bias",
    333       "detail": "Participants were recruited via Slack channels and informal messages. This self-selection likely biases toward developers who are interested in or favorable toward AI tools, not representative of the general developer population."
    334     },
    335     {
    336       "flag": "Company conflict of interest",
    337       "detail": "Multiple authors are WirelessCar employees, and the study was conducted at WirelessCar. Positive findings about AI-assisted code review benefit the company's modernization narrative. No conflict of interest statement is provided."
    338     },
    339     {
    340       "flag": "No IRB or ethics approval",
    341       "detail": "A study involving human participants in interviews and experiments at a company mentions no ethics review, which is a significant omission for a paper published at a major empirical SE venue."
    342     },
    343     {
    344       "flag": "Demand characteristics not addressed",
    345       "detail": "Researchers were present during all experiment sessions, participants were given 'onboarding briefings' about the tool, and were 'occasionally guided to try rephrasing' queries. This creates strong demand characteristics — participants may report more positive experiences to please the researchers who built the tool."
    346     },
    347     {
    348       "flag": "No limitations section",
    349       "detail": "Despite significant methodological constraints (small sample, single company, convenience sampling, no blinding, researcher presence), the paper has no dedicated limitations or threats-to-validity section."
    350     }
    351   ],
    352   "cited_papers": [
    353     {
    354       "title": "GPT-4 technical report",
    355       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    356       "year": 2023,
    357       "arxiv_id": "2303.08774",
    358       "relevance": "Foundational LLM technical report relevant to understanding capabilities applied to code review."
    359     },
    360     {
    361       "title": "Large language models for software engineering: A systematic literature review",
    362       "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"],
    363       "year": 2024,
    364       "relevance": "Comprehensive survey of LLM applications in software engineering tasks including code review."
    365     },
    366     {
    367       "title": "Deep learning-based code reviews: A paradigm shift or a double-edged sword?",
    368       "authors": ["Rosalia Tufano", "Alberto Martin-Lopez", "Ahmad Tayeb"],
    369       "year": 2024,
    370       "arxiv_id": "2411.11401",
    371       "relevance": "Evaluates the measurable impact of AI-generated code reviews in controlled experiments, directly relevant to LLM-assisted code review effectiveness."
    372     },
    373     {
    374       "title": "Improving automated code reviews: Learning from experience",
    375       "authors": ["Hong Yi Lin", "Patanamon Thongtanunam", "Christoph Treude"],
    376       "year": 2024,
    377       "relevance": "Studies fine-tuning LLMs for better issue detection in code reviews."
    378     },
    379     {
    380       "title": "AI-assisted assessment of coding practices in modern code review",
    381       "authors": ["Manushree Vijayvergiya", "Małgorzata Salawa", "Ivan Budiselic"],
    382       "year": 2024,
    383       "relevance": "Large-scale deployment of LLM-based automated coding standards enforcement in code reviews."
    384     },
    385     {
    386       "title": "AI-powered code review with LLMs: Early results",
    387       "authors": ["Zeeshan Rasheed", "Malik Abdul Sami", "Muhammad Waseem"],
    388       "year": 2024,
    389       "arxiv_id": "2404.18496",
    390       "relevance": "Multi-agent LLM system for autonomous code reviews, directly comparable approach."
    391     },
    392     {
    393       "title": "Human and machine: How software engineers perceive and engage with AI-assisted code reviews compared to their peers",
    394       "authors": ["Adam Alami", "Neil A. Ernst"],
    395       "year": 2025,
    396       "arxiv_id": "2501.02092",
    397       "relevance": "Qualitative study of developer emotional and cognitive responses to AI-provided vs human-provided code review feedback."
    398     },
    399     {
    400       "title": "Towards automating code review activities",
    401       "authors": ["Rosalia Tufano", "Luca Pascarella", "Michele Tufano"],
    402       "year": 2021,
    403       "relevance": "Early deep learning approach to automating code review, foundational work in the space."
    404     },
    405     {
    406       "title": "A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions",
    407       "authors": ["Lei Huang", "Weijiang Yu", "Weitao Ma"],
    408       "year": 2025,
    409       "relevance": "Survey on LLM hallucination, directly relevant to trust and accuracy concerns in AI-assisted code review."
    410     }
    411   ],
    412   "engagement_factors": {
    413     "practical_relevance": {
    414       "score": 2,
    415       "justification": "Provides actionable design insights for teams building AI code review tools, and releases prototype source code."
    416     },
    417     "surprise_contrarian": {
    418       "score": 0,
    419       "justification": "Confirms expected findings: developers like AI summaries, worry about false positives, prefer integration into existing tools."
    420     },
    421     "fear_safety": {
    422       "score": 0,
    423       "justification": "No AI safety or security concerns raised beyond generic false positive warnings."
    424     },
    425     "drama_conflict": {
    426       "score": 0,
    427       "justification": "No controversy or conflict — straightforward empirical study with unsurprising findings."
    428     },
    429     "demo_ability": {
    430       "score": 2,
    431       "justification": "Frontend and backend code released on GitHub under GPLv3, though setup requires OpenAI API key and RAG indexing."
    432     },
    433     "brand_recognition": {
    434       "score": 1,
    435       "justification": "Uses OpenAI o4-mini, but the study is from WirelessCar/Chalmers, not a high-profile AI lab."
    436     }
    437   }
    438 }

Impressum · Datenschutz