scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27672B)
      1 {
      2   "paper": {
      3     "title": "EditFlow: Benchmarking and Optimizing Code Edit Recommendation Systems via Reconstruction of Developer Flows",
      4     "authors": ["Chenyan Liu", "Yun Lin", "Jiaxin Chang", "Jiawei Liu", "Binhang Qi", "Bo Jiang", "Zhiyong Huang", "Jin Song Dong"],
      5     "year": 2026,
      6     "venue": "OOPSLA (Proc. ACM Program. Lang.)",
      7     "arxiv_id": "2602.21697",
      8     "doi": "10.1145/3798249"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper provides an anonymous website [3] (sites.google.com/view/editflow) with source code, auto-tuned prompt, dataset, and experiment results. They also implement a VS Code extension."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The annotated dataset of 100 commits with edit order labels is released via their website. The industrial dataset cannot be released due to compliance restrictions, but the annotated benchmark is available."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No mention of requirements.txt, Dockerfile, or detailed environment/dependency specifications in the paper."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are described in the paper. The website is referenced but the paper itself does not include specific commands or a reproduction guide."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Results in Tables 5-7 report point estimates only (percentages) with no confidence intervals or error bars."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The user study (Section 7.4.5-7.4.6) uses Mann-Whitney U test with permutation testing (10,000 resamples) and reports p-values for each task comparison."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Section 7.4.6 reports effect sizes (r) derived from the standardized U statistic for each task comparison, e.g., r=0.788 for Task 2 EG1 vs CG1."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The user study uses 32 participants (8 per group) with no power analysis or justification for the sample size."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No standard deviations, variance, or spread measures are reported for the benchmark experiments (Tables 5-6). Individual user times are shown in Table 7, but no aggregate variance metrics are reported."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper compares against zero-shot, few-shot, hand-crafted prompt, and DSPy baselines for order recovery (Table 3), and evaluates Cursor, Claude Code, and CoEdPilot with/without EditFlow (Tables 5-6)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines include Cursor CLI (2025.09.18), Claude Code (1.0.113), CoEdPilot (2024), and DSPy — all contemporary systems."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The comparison of Original vs w/ EditFlow for each baseline system (Tables 5-6) serves as an ablation showing the contribution of the flow-aware optimization component."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metric categories are used: flow categories (Keep/Jump/Revert/Break), precision/recall/F0.5, and resource usage metrics (latency, tokens, cost)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "RQ4 (Section 7.4) presents a user study with 32 developers evaluating real-world task completion and perceived recommendation quality."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Section 7.1.2: 'we split the dataset at the commit level in a 7:3 ratio, ensuring that all samples from the same commit are assigned to the same split' — preventing intra-commit data leakage."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down per baseline system (Cursor, Claude Code, CoEdPilot), per task in the user study (Tasks 1-3), and per flow category (Keep/Jump/Revert/Break)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 8 provides a detailed failure analysis with two specific failure modes (false rejection due to k-context sensitivity and acceptance of incorrect flow-keeping edits), including concrete examples."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports that EditFlow does not show statistically significant improvement on Tasks 1 and 3 in the user study, and discusses why (Section 7.4.6). Recall decreases by 7.09% on average."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims (63.81% order reconstruction improvement, 75% flow violation reduction, 66.99% precision improvement, 25.11% faster completion) are all supported by Tables 3-7 in the results sections."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims like 'EditFlow improves precision' are supported by controlled comparisons (Original vs w/ EditFlow on same benchmarks). The user study compares control and experiment groups with statistical testing, although the paper does not describe how participants were assigned to groups."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title claims to benchmark 'Code Edit Recommendation Systems' generally, but evaluation is limited to Python commits only. The paper acknowledges this in Section 9 (External validity) but the title and abstract do not bound to Python."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 9 (Threats to Validity) discusses multiple alternative explanations: digital twin assumes correct developer decisions, edit-order data may not be optimal, violation-based metric may introduce optimistic bias, and LLM stochasticity."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper explicitly frames 'mental flow' as a cognitive construct and operationalizes it via pairwise edit order relations. It discusses the gap between the proxy (edit ordering) and the construct (cognitive flow state), noting that 'our operationalization through the Keep/Jump/Revert/Break taxonomy may only approximate developers' cognitive states' (Section 9)."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Section 7.1.1 specifies 'Claude-Sonnet-4-20250514' with exact snapshot date. Section 7.3.3 gives exact versions: 'Claude Code (Version 1.0.113), Cursor CLI (Version 2025.09.18-7ae6800)'."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The auto-tuned prompt is central to the method but is not included in the paper. The paper says 'For the detailed learned prompt, please refer to our anonymous website [3]' (Section 6.1). The prompt itself is not in the paper or appendix."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section 7.1.1: 'maximum output length of 4096 tokens and a temperature of 0.7. The auto-tuning underwent 5 epochs and a batch size of 32.'"
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "EditFlow's own scaffolding (the filter and re-rank wrapper) is described, but the digital twin integration approach is only briefly described. The paper evaluates Cursor CLI and Claude Code as black boxes ('relied on their default underlying models without manually specifying a particular model'), so the underlying systems' scaffolding is not described; per schema rules, that part would be NA for the third-party tools."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 7.3.2 documents commit selection criteria: '(1) containing 5-10 edit hunks across at least 2 source files; (2) involving real user authorship; (3) excluding merge commits and filename changes; (4) maintaining ASCII-only content with meaningful code modifications.'"
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 9 'Threats to Validity' provides a substantive discussion of external, construct, and internal validity threats."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 9 discusses specific threats: Python-only benchmark composition, the single-trajectory limitation of edit order data, the digital twin's assumption of correct developer decisions, and LLM stochasticity affecting prediction stability."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 9 explicitly states limitations: 'our data composition may limit the generalizability of our findings to other programming languages, development workflows, or industrial settings.' Also acknowledges the digital twin's simplifying assumptions."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The annotated dataset and experiment results are stated as available at the anonymous website. The industrial dataset cannot be released."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 7.1.2 describes the annotated dataset: '100 commits from the 45 most-starred open-source GitHub Python repositories, comprising 772 edit hunks and 1,747 directed edges.' Section 7.2.2 describes the industrial dataset: '500 commits from Jun. 2025 to Aug. 2025, containing 3,059 edit hunks.'"
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 7.4.2: 'We recruited 32 participants from 2 universities' with demographics (age 20-30, CS students from undergraduate to PhD, 4.5 days/week coding, 90% prior AI tool experience). Footnote 3 describes employee consent and anonymization for industrial data."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The annotation pipeline is described in Section 6.1: independent annotation by two authors, consensus resolution, 20 minutes per commit, 77 person-hours total. The data split is documented (7:3 at commit level)."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The Acknowledgments section lists multiple funding sources: National Natural Science Foundation of China, Ministry of Education Singapore, National Research Foundation Singapore, AI Singapore Programme, and Cyber Security Agency of Singapore."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: Shanghai Jiao Tong University, National University of Singapore, and ByteDance. Bo Jiang from ByteDance is disclosed."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Funders are government research agencies (NSFC, Singapore MOE, NRF) that have no commercial stake in the outcome. ByteDance affiliation exists but they are not listed as a funder."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is provided. One author is from ByteDance, which develops code editing tools, but no financial interest disclosure is included."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper uses Claude-Sonnet-4-20250514 and Cursor/Claude Code for evaluation but does not state the training data cutoff dates for these models."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper does not discuss whether the GitHub commits used in benchmarks could have appeared in the training data of the LLMs used for order recovery or evaluation."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The benchmark uses commits from popular GitHub repositories (most-starred Python repos). These are highly likely to be in training data for Claude and other models, but contamination risk is not discussed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No pre-registration mentioned for the user study with 32 participants."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No IRB or ethics board approval is mentioned for the user study. Footnote 3 mentions employee consent for industrial data but not ethics approval for the user study."
    251       },
    252       "demographics_reported": {
    253         "applies": true,
    254         "answer": true,
    255         "justification": "Section 7.4.2: participants aged 20-30, CS students (undergrad to PhD), coding 4.5 days/week on average, 90% with prior AI tool experience."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": true,
    259         "answer": true,
    260         "justification": "Section 7.4.2: 'All participants are required to complete a pre-study questionnaire to collect their background information, including educational level, programming proficiency, and prior experience with AI-assisted programming tools.'"
    261       },
    262       "randomization_described": {
    263         "applies": true,
    264         "answer": false,
    265         "justification": "The paper describes four groups (CG1, EG1, CG2, EG2) but does not explain how participants were assigned to groups (randomization procedure not described)."
    266       },
    267       "blinding_described": {
    268         "applies": true,
    269         "answer": false,
    270         "justification": "No mention of whether participants knew which condition they were in (with or without EditFlow). Blinding is not discussed."
    271       },
    272       "attrition_reported": {
    273         "applies": true,
    274         "answer": true,
    275         "justification": "Table 7 shows results for all 32 participants (P1-P32), 8 per group, with no apparent dropouts. All participants completed all 3 tasks."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Tables 5-6 report per-query resource usage including latency (seconds), token usage (K), and monetary cost ($) for each system with and without EditFlow."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total computational budget (GPU hours, total API spend, total experiment cost) is reported. Only per-query costs are shown."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No discussion of random seed sensitivity. Section 9 acknowledges LLM stochasticity but does not report results across multiple seeds."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The paper does not state how many times each experiment was run. The digital twin simulation appears to be single-run."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The prompt auto-tuning uses 5 epochs but the total search budget (number of candidate prompts evaluated, compute cost) is not reported."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Algorithm 1 describes the prompt selection procedure: accuracy on the full training set is used to select the best prompt at each epoch, which is a clearly defined selection criterion."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The user study performs multiple statistical comparisons (3 tasks × 2 system pairs = 6 tests) but no multiple comparison correction (Bonferroni, etc.) is applied."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors evaluate their own EditFlow system against baselines without acknowledging author-evaluation bias."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Tables 5-6 report resource usage (latency, tokens, cost) alongside performance for each system, allowing compute-performance comparison. The paper discusses the additional overhead of EditFlow (1.71s latency, 6.58K tokens, $0.03 per query)."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Section 9 (Construct validity) explicitly discusses: 'The notion of mental-flow alignment is inherently abstract, and our operationalization through the Keep/Jump/Revert/Break taxonomy may only approximate developers' cognitive states.'"
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "The paper evaluates Cursor, Claude Code, and CoEdPilot as bundled products/tools. The scaffold IS the thing being tested, so this criterion does not apply."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether the benchmark commits existed before the LLM training cutoffs. Top-starred GitHub repos are very likely in training data."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "The digital twin provides commit messages as edit descriptions to the SUTs, which could leak information about the expected edits. This is not discussed."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": true,
    351         "justification": "Section 7.1.2: 'To avoid intra-commit data leakage, we split the dataset at the commit level in a 7:3 ratio, ensuring that all samples from the same commit are assigned to the same split.'"
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection or prevention method is used beyond the commit-level split. No canary strings, membership inference, or decontamination."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "68.81% of model recommendations disrupt developers' ongoing mental flow, including 8.83% that are technically correct but ill-timed.",
    363       "evidence": "Table 1 (Section 5): Cursor shows 28.23% Keep, Claude Code 34.16% Keep, with remaining edits in Jump/Revert/Break categories. Analysis of 50 real-world commits.",
    364       "supported": "moderate"
    365     },
    366     {
    367       "claim": "Auto-tuned prompt achieves 87.26% accuracy on edit order recovery, a 63.81% relative improvement over the hand-crafted prompt baseline (53.27%).",
    368       "evidence": "Table 3 (Section 7.1.5): Accuracy comparison across zero-shot (50.63%), few-shot (47.42%), hand-crafted (53.27%), DSPy (53.39%), and auto-tuned (87.26%).",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "EditFlow reduces flow violations by over 75% on real-world industrial data.",
    373       "evidence": "Table 4 (Section 7.2.4): Auto-tuned prompt achieves only 30 violations vs. 121 for best baseline (hand-crafted prompt).",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "EditFlow improves recommendation precision by an average of 66.99% across systems and benchmarks.",
    378       "evidence": "Tables 5-6 (Section 7.3.5): Precision improvements shown for Cursor, Claude Code, and CoEdPilot on both large-scale and human-annotated benchmarks.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "User study shows 25.11% faster task completion with EditFlow.",
    383       "evidence": "Table 7 (Section 7.4.6): Average completion times compared across experiment and control groups, but statistical significance only achieved for Task 2 (p=0.0004). Tasks 1 and 3 not statistically significant.",
    384       "supported": "moderate"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval", "rct"],
    388   "key_findings": "EditFlow demonstrates that existing AI code editing tools (Cursor, Claude Code, CoEdPilot) disrupt developers' mental flow in 65-72% of recommendations. A prompt auto-tuning approach recovers edit ordering with 87.26% accuracy. Applied as a post-processing wrapper, EditFlow improves recommendation precision by 66.99% on average and reduces flow violations by 75%. A user study with 32 participants shows 25.11% faster task completion, with statistically significant gains on the most challenging task but not on easier tasks.",
    389   "red_flags": [
    390     {
    391       "flag": "Small user study with no randomization description",
    392       "detail": "32 participants split into 4 groups of 8 each. No power analysis, no description of how participants were assigned to groups, no blinding. Statistical significance achieved only on 1 of 3 tasks for both system comparisons."
    393     },
    394     {
    395       "flag": "Benchmark contamination risk unaddressed",
    396       "detail": "Benchmarks use commits from the most-starred GitHub Python repositories, which are very likely in the training data of the LLMs used. No contamination analysis is performed."
    397     },
    398     {
    399       "flag": "Self-fulfilling evaluation concern partially addressed",
    400       "detail": "The paper acknowledges that flow-aware metrics could be self-fulfilling (Section 7.3.2) and includes a human-annotated benchmark to mitigate, but the large-scale benchmark still uses LLM-inferred partial orders as ground truth."
    401     },
    402     {
    403       "flag": "ByteDance affiliation with no conflict disclosure",
    404       "detail": "One author is from ByteDance, which develops code editing tools. No competing interests statement is provided."
    405     },
    406     {
    407       "flag": "Selective reporting of the 25.11% claim",
    408       "detail": "The headline '25.11% faster task completion' is an average across tasks and groups, but only Task 2 shows statistically significant improvement. Tasks 1 and 3 show no significant difference. The abstract presents this as an unqualified finding."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    414       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    415       "year": 2025,
    416       "doi": "10.48550/arXiv.2507.09089",
    417       "relevance": "Key RCT finding that developers completed tasks 19% slower with AI assistance, directly motivating EditFlow's premise."
    418     },
    419     {
    420       "title": "CoEdPilot: Recommending Code Edits with Learned Prior Edit Relevance, Project-wise Awareness, and Interactive Nature",
    421       "authors": ["Chenyan Liu", "Yufan Cai", "Yun Lin"],
    422       "year": 2024,
    423       "doi": "10.1145/3650212.3652142",
    424       "relevance": "Key baseline system for code edit recommendation that EditFlow wraps and optimizes."
    425     },
    426     {
    427       "title": "Codeplan: Repository-level coding using llms and planning",
    428       "authors": ["Ramakrishna Bairi"],
    429       "year": 2024,
    430       "doi": "10.1145/3643757",
    431       "relevance": "Integrates LLMs with static analysis for code change reasoning, relevant to repository-level code editing."
    432     },
    433     {
    434       "title": "The SPACE of Developer Productivity: There's more to it than you think",
    435       "authors": ["Nicole Forsgren", "Margaret-Anne Storey", "Chandra Maddila", "Thomas Zimmermann"],
    436       "year": 2021,
    437       "doi": "10.1145/3453928",
    438       "relevance": "Industry-standard framework for developer productivity that positions flow as a core dimension."
    439     },
    440     {
    441       "title": "DevEX: What actually drives productivity?",
    442       "authors": ["Abi Noda", "Margaret-Anne Storey", "Nicole Forsgren", "Michaela Greiler"],
    443       "year": 2023,
    444       "doi": "10.1145/3610285",
    445       "relevance": "Developer experience framework positioning flow state as a first-class driver of effective software development."
    446     },
    447     {
    448       "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
    449       "authors": ["Omar Khattab"],
    450       "year": 2024,
    451       "relevance": "Baseline prompt optimization framework compared against EditFlow's auto-tuning approach."
    452     },
    453     {
    454       "title": "The impact of LLM-assistants on software developer productivity: A systematic literature review",
    455       "authors": ["Amr Mohamed", "Maram Assi", "Mariam Guizani"],
    456       "year": 2025,
    457       "doi": "10.48550/arXiv.2507.03156",
    458       "relevance": "SLR documenting both positive effects and downsides of LLM coding assistants on developer productivity."
    459     },
    460     {
    461       "title": "Good Vibrations? A Qualitative Study of Co-Creation, Communication, Flow, and Trust in Vibe Coding",
    462       "authors": ["Veronica Pimenova", "Sarah Fakhoury", "Christian Bird", "Margaret-Anne Storey", "Madeline Endres"],
    463       "year": 2025,
    464       "doi": "10.48550/arXiv.2509.12491",
    465       "relevance": "Qualitative evidence on developer flow disruption during AI-assisted coding (vibe coding)."
    466     },
    467     {
    468       "title": "It's weird that it knows what i want: Usability and interactions with copilot for novice programmers",
    469       "authors": ["James Prather"],
    470       "year": 2023,
    471       "doi": "10.1145/3617367",
    472       "relevance": "Documents cognitive load and flow disruption from AI coding assistants for novice programmers."
    473     },
    474     {
    475       "title": "Overwatch: Learning Patterns in Code Edit Sequences",
    476       "authors": ["Yuhao Zhang"],
    477       "year": 2022,
    478       "doi": "10.1145/3563302",
    479       "relevance": "Prior work on learning code edit patterns from IDE-logged traces, directly related to edit recommendation."
    480     }
    481   ]
    482 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs