scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24844B)
      1 {
      2   "paper": {
      3     "title": "Intuition to Evidence: Measuring AI's True Impact on Developer Productivity",
      4     "authors": [
      5       "Anand Kumar",
      6       "Vishal Khare",
      7       "Deepak Sharma",
      8       "Satyam Kumar",
      9       "Vijay Saini",
     10       "Anshul Yadav",
     11       "Sachendra Jain",
     12       "Ankit Rana",
     13       "Pratham Verma",
     14       "Vaibhav Meena",
     15       "Avinash Edubilli"
     16     ],
     17     "year": 2025,
     18     "venue": "arXiv",
     19     "arxiv_id": "2509.19708"
     20   },
     21   "scan_version": 2,
     22   "active_modules": [],
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Section 8 (Annexure) lists five GitHub repository URLs for the DeputyDev platform components (extension backend, VS Code extension, binary distribution, core engine, authentication service)."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No dataset is released. Section 7.3.2 mentions plans to 'release anonymised versions of our dataset' as future work, but no data is currently available."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No environment specifications, dependency files, or setup instructions are provided for reproducing the study or running the platform."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided. The study's analysis pipeline, statistical scripts, and data processing steps are not documented in a reproducible form."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Standard deviations reported for cycle time and review time metrics: 'mean cycle time of 150.5h (±13.1h)' and 'review time of 128.8h (±16.1h)' in Section 5.2.1."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "P-values reported for main results: '33.8% cycle time reduction (p = 0.0018)' and '29.8% review time reduction (p = 0.0076)' in Section 5.2.1. Cohen's d also provided."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Cohen's d = 1.42 reported for code productivity increase (Section 5.3.3). Percentage improvements with baselines provided throughout (e.g., 33.8% reduction from 150.5h to 99.6h)."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No power analysis or justification for the sample sizes used. The N=300 total and N=30 per cohort appear to be convenience samples based on the organization's size, not justified by statistical reasoning."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Standard deviations reported alongside means: '150.5h (±13.1h)', '99.6h (±23.7h)', '128.8h (±16.1h)', '90.5h (±20.1h)' in Section 5.2.1/5.3.3."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Within-subjects pre-deployment baseline (Sep 2024 - Feb 2025) compared against post-deployment (Mar 2025 - Aug 2025). Between-subjects comparison of high vs low adoption cohorts also provided."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The baseline is the same engineers' pre-deployment performance, which is the most appropriate contemporary comparison for a deployment study."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "The system has two major components (code generation and PR review) but no ablation study isolates their individual contributions. The components were also deployed at different times, but this temporal difference is not leveraged as an ablation."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Multiple metrics reported: PR cycle time, review time, code volume (LOC shipped), acceptance rates, survey satisfaction, NPS score, cost per engineer."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Developer surveys (228 respondents) and NPS survey (125 respondents) evaluated the tool's effectiveness. Qualitative interviews with 125 engineers also conducted (Section 4.5)."
     98       },
     99       "held_out_test_set": {
    100         "applies": false,
    101         "answer": false,
    102         "justification": "Not applicable — this is a deployment study measuring real-world productivity, not a benchmark evaluation requiring train/test splits."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results broken down by experience level (SDE1-3 in Table 3), by adoption cohort (high vs low in Section 5.3), and by usage category (Figure 8)."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 6.3 'What Didn't Work' discusses specific failures: generic models ineffective for specialized codebases, automatic acceptance led to quality issues, over-automation created resistance."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Low adoption cohort showed -11.4% decline in shipped code (Section 5.3.3). Section 6.3 reports approaches that failed. Code generation satisfaction was lower (57%) than review (85%)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims of 31.8% cycle time reduction, 85% code review satisfaction, 93% desire to continue, adoption scaling from 4% to 83% — all supported by specific numbers in the results sections."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper makes causal claims ('demonstrably improve developer productivity') based on a quasi-experimental design with self-selected adoption groups. High vs low adopters are endogenous — engineers who adopt more may be inherently more productive or motivated. The difference-in-differences approach is claimed but the 'treatment' (adoption level) is not randomly assigned. Section 5.3.5 claims to rule out alternative explanations but the selection bias concern is not adequately addressed by baseline matching alone."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims to measure 'AI's True Impact on Developer Productivity' — an unbounded generalization from a single-organization study of an in-house tool. While Section 7.2 acknowledges single-org limitation, the title and abstract frame results as broadly applicable."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Section 5.3.5 explicitly addresses selection bias, motivation effects, training disparities, and tool availability as alternative explanations. Section 4.6 discusses Hawthorne effects, maturation, and instrumentation changes."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The paper measures PR cycle time and lines of code shipped and frames these as 'developer productivity' without discussing the gap between these proxies and actual productivity. LOC is a particularly problematic proxy — more lines shipped could mean more verbose AI-generated code, not more productive work. No discussion of whether faster reviews compromise quality or whether code volume reflects value delivered."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Section 3.1 mentions 'Claude Sonnet 3.7 and 4.0 models' — these are marketing names without specific API versions or snapshot dates. No version information for other LLM providers mentioned (OpenAI, Vertex AI)."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "No prompts or system instructions for any of the six review agents or the code generation system are provided. Only high-level descriptions of each agent's focus area."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) reported for any of the agents or code generation system."
    162       },
    163       "scaffolding_described": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 3.1 describes the multi-agent PR review architecture with six specialized agents, their tools (File Reader, Path Searcher, Grep, Planner), and the comment blending engine. Section 3.2 describes the code generation architecture with Weaviate vector DB and tool suite. Figures 3 and 5 provide architecture diagrams."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "Section 4.5 lists data sources (webhooks, instrumentation, surveys) but does not document how raw data was cleaned, filtered, or transformed into the reported metrics. No filtering criteria, outlier handling, or data cleaning steps described."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7.2 'Limitations' provides a dedicated subsection discussing four specific limitations."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 4.6 discusses specific threats: selection bias addressed through within-subjects design and propensity score matching, Hawthorne effects mitigated by not informing participants, maturation controlled through baseline period. These are specific to this study."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 7.2 states specific boundaries: single-organization, in-house system may differ from public tools, 1-year observation may miss long-term patterns, cultural/regional factors may limit applicability."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "No raw data released. Section 7.3.2 promises future release of 'anonymised versions of our dataset' but nothing is currently available."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 4.5 describes four data collection channels: version control analytics via webhooks, code generation metrics via DeputyDev instrumentation, code review analytics, and performance benchmarking. Multi-source validation with quantitative metrics, surveys (228 engineers, 76% response rate), interviews (125 engineers), and manager assessments."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "For the main study, the tool was deployed across all 300 engineers (adoption peaked at 83%) — but how teams/engineers were selected for inclusion is not described. For the survey, it states 228 developers 'participated across teams' but no recruitment method is described. For the NPS survey (125 participants), selection method is not explained."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "The pipeline from raw webhook/instrumentation data to final reported metrics is not documented. No filtering criteria, no counts of excluded data points, no description of how metrics like 'cycle time' were computed from raw PR events."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding disclosure, acknowledgments section, or statement about financial support. All authors are from 1mg.com, a commercial entity, suggesting corporate funding but this is not explicitly stated."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "All 11 authors list @1mg.com email addresses, clearly indicating their affiliation with the company that built and deployed DeputyDev."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The authors work at 1mg (Tata 1mg), the company that built DeputyDev. The company has direct financial interest in demonstrating the tool's effectiveness — positive results justify continued investment and could be used for marketing. This is a textbook non-independent funder situation."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial disclosure statement. The authors are employees of the company whose product is being evaluated, yet no conflict of interest is acknowledged."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "This study does not evaluate a pre-trained model on a benchmark. It measures deployment impact of an AI tool on developer productivity."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Not applicable — no benchmark evaluation is performed."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "Not applicable — no benchmark evaluation is performed."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "No mention of pre-registration. The study involves 300 engineers as participants and surveys of 228 and 125 engineers, but no pre-registration is referenced."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No IRB or ethics board approval mentioned despite studying 300 engineers' work patterns and conducting surveys. The paper also claims participants were not informed they were being observed (Section 4.6), which raises ethical concerns."
    262       },
    263       "demographics_reported": {
    264         "applies": true,
    265         "answer": true,
    266         "justification": "Experience levels (SDE1, SDE2, SDE3) reported with stratified results in Table 3. Survey respondent distribution by role: Backend 64%, Frontend 17%, Android/iOS 14%, QA 5% (Section 5.4)."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "No explicit inclusion or exclusion criteria stated for which of the 300 engineers were included in the study or how cohorts were defined beyond percentile-based adoption thresholds."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "Not applicable — this is an observational/quasi-experimental study. The authors explicitly state 'random assignment of engineers to control and treatment groups would disrupt existing team structures' (Section 4.1)."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "Not applicable — observational study design precludes blinding. Engineers knew they were using the AI tool."
    282       },
    283       "attrition_reported": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "Survey response rate reported: 228 out of ~300 (76%). NPS survey had 125 participants. Adoption stabilization from 83% peak to 60% sustained engagement is reported. 94.2% retention rate mentioned in Figure 6."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section 5.6 provides detailed monthly cost breakdown in Table 5, including per-provider LLM API costs and cost per engineer ($30-34/month in August 2025)."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Table 5 reports total 5-month operational cost of $46,833, broken down by LLM API costs (Bedrock, OpenAI, Vertex AI) and infrastructure costs (compute, database, cache). Annualized cost estimated at ~$112,000."
    299       }
    300     }
    301   },
    302   "claims": [
    303     {
    304       "claim": "31.8% overall reduction in PR review cycle time after AI tool deployment",
    305       "evidence": "Section 5.2.1: Cohort 1 baseline mean cycle time 150.5h reduced to 99.6h (33.8% reduction, p=0.0018), review time from 128.8h to 90.5h (29.8% reduction, p=0.0076).",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Top 30 adopters achieved 61% increase in shipped code volume",
    310       "evidence": "Section 5.3.3: High adoption cohort shipped 168,676→272,191 LOC (61.3% increase, p<0.001, Cohen's d=1.42), with 150k AI-generated lines merged to production.",
    311       "supported": "weak"
    312     },
    313     {
    314       "claim": "Low adoption cohort experienced 11.4% decline in shipped code",
    315       "evidence": "Section 5.3.3: Low adoption cohort shipped 253,332→224,282 LOC (-11.4%, p=0.08, Cohen's d=-0.31).",
    316       "supported": "weak"
    317     },
    318     {
    319       "claim": "Junior engineers (SDE1) showed highest productivity gain at 77%",
    320       "evidence": "Table 3 and Figure 10: SDE1 code production increased from 80,492 to 142,354 LOC (77% improvement) for top adopters.",
    321       "supported": "weak"
    322     },
    323     {
    324       "claim": "85% satisfaction for code review and 93% desire to continue using platform",
    325       "evidence": "Section 5.4: 194/228 respondents (85%) want continued PR reviews, 93% plan to keep DeputyDev. Table 4 summarizes survey KPIs.",
    326       "supported": "moderate"
    327     },
    328     {
    329       "claim": "28% increase in overall production code shipment volume",
    330       "evidence": "Abstract and Section 7: '28% increase in code shipment volume' attributed to AI-generated code comprising 30-40% of shipped code. Detailed in Figure 2.",
    331       "supported": "weak"
    332     }
    333   ],
    334   "methodology_tags": [
    335     "observational",
    336     "case-study"
    337   ],
    338   "key_findings": "A year-long deployment of an in-house AI platform (DeputyDev) across 300 engineers at 1mg showed a 31.8% overall reduction in PR review cycle time (Cohort 1: 33.8% cycle time reduction, p=0.0018; 29.8% review time reduction, p=0.0076) and a 61% increase in shipped code for top adopters. Adoption scaled from 4% to an 83% peak before stabilizing at 60%. The study is significantly compromised by the authors evaluating their own company's product, self-selected adoption cohorts used for causal claims, and lines of code as the primary productivity proxy without validity discussion.",
    339   "red_flags": [
    340     {
    341       "flag": "Company evaluating its own product",
    342       "detail": "All 11 authors are employees of 1mg.com, the company that built DeputyDev. No external evaluators or independent replication. No conflict of interest statement despite obvious financial interest in positive results. This is precisely the Wakefield-pattern conflict (researchers holding an undisclosed financial stake in the result they report) that the checklist is designed to catch."
    343     },
    344     {
    345       "flag": "Self-selected adoption groups used for causal claims",
    346       "detail": "The high vs low adoption comparison (Section 5.3) is used to claim causal effects, but adoption level is endogenous — engineers who adopt AI tools more enthusiastically may differ in motivation, skill, or work style. The paper claims to address this through baseline matching but cannot rule out unobserved confounders driving both adoption and productivity."
    347     },
    348     {
    349       "flag": "Lines of code as productivity metric",
    350       "detail": "LOC shipped is the primary productivity metric, but AI-generated code inflates LOC by design. A 61% increase in LOC with 150k AI-generated lines could simply mean more verbose code, not more productive work. The paper does not discuss whether code volume correlates with value delivered."
    351     },
    352     {
    353       "flag": "No IRB/ethics approval for covert observation",
    354       "detail": "The paper states 'individuals in the experiment were not informed that they were under observation' (Section 4.6) to address Hawthorne effects, but conducting covert research on 300 employees without IRB approval or informed consent raises serious ethical concerns."
    355     },
    356     {
    357       "flag": "Self-citation of prior company paper",
    358       "detail": "Reference [3] is a prior paper by several of the same authors evaluating an earlier version of the same DeputyDev tool. The company is building a publication record evaluating its own product."
    359     },
    360     {
    361       "flag": "Inconsistent cohort definitions",
    362       "detail": "The paper uses 'Cohort 1' and 'Cohort 2' to mean different things in different sections — in Section 5.2.1 they refer to temporal periods (pre/post deployment), while in Section 5.3 they refer to adoption levels (high/low). This creates confusion about which comparisons support which claims."
    363     },
    364     {
    365       "flag": "Low adoption cohort decline unexplained",
    366       "detail": "The 11.4% decline in shipped code for low adopters (p=0.08) is presented as evidence that non-adoption hurts productivity, but the decline is not statistically significant and could reflect regression to the mean, seasonal variation, or other factors."
    367     }
    368   ],
    369   "cited_papers": [
    370     {
    371       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    372       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    373       "year": 2023,
    374       "arxiv_id": "2302.06590",
    375       "relevance": "Foundational RCT on Copilot's productivity impact (55.8% faster task completion), directly comparable methodology and claims."
    376     },
    377     {
    378       "title": "DeputyDev - AI Powered Developer Assistant: Breaking the Code Review Logjam through Contextual AI to Boost Developer Productivity",
    379       "authors": ["Vishal Khare", "Vijay Saini", "Deepak Sharma", "Anand Kumar", "Ankit Rana", "Anshul Yadav"],
    380       "year": 2025,
    381       "arxiv_id": "2508.09676",
    382       "relevance": "Prior paper by same team evaluating earlier DeputyDev version with A/B testing (23% review time reduction)."
    383     },
    384     {
    385       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    386       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    387       "year": 2025,
    388       "arxiv_id": "2507.09089",
    389       "relevance": "Contrasting finding: 19% increase in completion time (negative productivity effect) for experienced open-source developers using AI tools."
    390     },
    391     {
    392       "title": "Evaluating large language models trained on code",
    393       "authors": ["Mark Chen", "Jerry Tworek"],
    394       "year": 2021,
    395       "arxiv_id": "2107.03374",
    396       "relevance": "Introduced HumanEval benchmark for code generation evaluation, referenced as baseline benchmark approach."
    397     },
    398     {
    399       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    400       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    401       "year": 2023,
    402       "arxiv_id": "2310.06770",
    403       "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks."
    404     },
    405     {
    406       "title": "AI-based Code Review in Practice: A Survey of the Landscape and Directions",
    407       "authors": ["Kai Sun", "Ming Wen", "Zhen Yang"],
    408       "year": 2025,
    409       "arxiv_id": "2508.18771",
    410       "relevance": "Survey of 16 AI-based code review tools analyzing effectiveness of different review granularities."
    411     },
    412     {
    413       "title": "RepoFusion: Training Code Models to Understand Your Repository",
    414       "authors": ["Disha Shrivastava", "Denis Kocetkov"],
    415       "year": 2023,
    416       "arxiv_id": "2306.10998",
    417       "relevance": "Repository-context code generation technique used in DeputyDev's chunking algorithm."
    418     },
    419     {
    420       "title": "Long Code Arena: a Set of Benchmarks for Long-Context Code Models",
    421       "authors": ["Egor Bogomolov", "Aleksandra Eliseeva"],
    422       "year": 2024,
    423       "arxiv_id": "2406.11612",
    424       "relevance": "Benchmark suite for evaluating code models requiring project-wide context, relevant to code generation evaluation."
    425     }
    426   ]
    427 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs