scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32427B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "EditFlow: Benchmarking and Optimizing Code Edit Recommendation Systems via Reconstruction of Developer Flows",
      6     "authors": [
      7       "Chenyan Liu",
      8       "Yun Lin",
      9       "Jiaxin Chang",
     10       "Jiawei Liu",
     11       "Binhang Qi",
     12       "Bo Jiang",
     13       "Zhiyong Huang",
     14       "Jin Song Dong"
     15     ],
     16     "year": 2026,
     17     "venue": "Proc. ACM Program. Lang. (OOPSLA)",
     18     "arxiv_id": "2602.21697",
     19     "doi": "10.1145/3798249"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Abstract claims (63.81% order reconstruction improvement, 75% flow violation reduction, 66.99% precision improvement, 25.11% faster completion) are all supported by Tables 3-7 in the results sections.",
     27         "source": "opus"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Causal claims like 'EditFlow improves precision' are supported by controlled comparisons (Original vs w/ EditFlow on same benchmarks). The user study compares experimental and control groups with statistical testing, although the paper does not describe how participants were assigned to groups.",
     33         "source": "opus"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The title claims to benchmark 'Code Edit Recommendation Systems' generally, but evaluation is limited to Python commits only. The paper acknowledges this in Section 9 (External validity) but the title and abstract do not bound to Python.",
     39         "source": "opus"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Section 9 (Threats to Validity) discusses multiple alternative explanations: digital twin assumes correct developer decisions, edit-order data may not be optimal, violation-based metric may introduce optimistic bias, and LLM stochasticity.",
     45         "source": "opus"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper explicitly frames 'mental flow' as a cognitive construct and operationalizes it via pairwise edit order relations. It discusses the gap between the proxy (edit ordering) and the construct (cognitive flow state), noting that 'our operationalization through the Keep/Jump/Revert/Break taxonomy may only approximate developers' cognitive states' (Section 9).",
     51         "source": "opus"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 9 'Threats to Validity' provides a substantive discussion of external, construct, and internal validity threats.",
     59         "source": "opus"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Section 9 discusses specific threats: Python-only benchmark composition, the single-trajectory limitation of edit order data, the digital twin's assumption of correct developer decisions, and LLM stochasticity affecting prediction stability.",
     65         "source": "opus"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Section 9 explicitly states limitations: 'our data composition may limit the generalizability of our findings to other programming languages, development workflows, or industrial settings.' Also acknowledges the digital twin's simplifying assumptions.",
     71         "source": "opus"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The Acknowledgments section lists multiple funding sources: National Natural Science Foundation of China, Ministry of Education Singapore, National Research Foundation Singapore, AI Singapore Programme, and Cyber Security Agency of Singapore.",
     79         "source": "opus"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Author affiliations are clearly listed: Shanghai Jiao Tong University, National University of Singapore, and ByteDance. Bo Jiang from ByteDance is disclosed.",
     85         "source": "opus"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Funders are government research agencies (NSFC, Singapore MOE, NRF) that have no commercial stake in the outcome. ByteDance affiliation exists but they are not listed as a funder.",
     91         "source": "opus"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests statement is provided. One author is from ByteDance, which develops code editing tools, but no financial interest disclosure is included.",
     97         "source": "opus"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Key terms are formally defined: Edit Hunk (Definition 1), Pairwise Edit Order (Definition 2), Mental Flow Graph (Definition 3), One-Hop Successor (Definition 4), and flow categories (Definition 5); 'mental flow' is attributed to Csikszentmihalyi 1990.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 1 enumerates five explicit contributions: the mental-flow concept applied to code editing, prompt auto-tuning for edit order recovery, digital twin evaluation framework, empirical validation, and VS Code extension implementation.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 10 situates EditFlow relative to static analysis methods (CCDemon, Overwatch), LLM-based code edit systems (CoEdPilot, GrACE, SARGAM, CoditT5), and developer productivity frameworks (SPACE, DevEx), explaining how EditFlow differs.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "The paper provides an anonymous website [3] (sites.google.com/view/editflow) with source code, auto-tuned prompt, dataset, and experiment results. They also implement a VS Code extension.",
    128           "source": "opus"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "The annotated dataset of 100 commits with edit order labels is released via their website. The industrial dataset cannot be released due to compliance restrictions, but the annotated benchmark is available.",
    134           "source": "opus"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No mention of requirements.txt, Dockerfile, or detailed environment/dependency specifications in the paper.",
    140           "source": "opus"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No step-by-step reproduction instructions are described in the paper. The website is referenced but the paper itself does not include specific commands or a reproduction guide.",
    146           "source": "opus"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Results in Tables 5-7 report point estimates only (percentages) with no confidence intervals or error bars.",
    154           "source": "opus"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "The user study (Section 7.4.5-7.4.6) uses Mann-Whitney U test with permutation testing (10,000 resamples) and reports p-values for each task comparison.",
    160           "source": "opus"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Section 7.4.6 reports effect sizes (r) derived from the standardized U statistic for each task comparison, e.g., r=0.788 for Task 2 EG1 vs CG1.",
    166           "source": "opus"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The user study uses 32 participants (8 per group) with no power analysis or justification for the sample size.",
    172           "source": "opus"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "No standard deviations, variance, or spread measures are reported for the benchmark experiments (Tables 5-6). Individual user times are shown in Table 7, but no aggregate variance metrics are reported for them.",
    178           "source": "opus"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "The paper compares against zero-shot, few-shot, hand-crafted prompt, and DSPy baselines for order recovery (Table 3), and evaluates Cursor, Claude Code, and CoEdPilot with/without EditFlow (Tables 5-6).",
    186           "source": "opus"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Baselines include Cursor CLI (2025.09.18), Claude Code (1.0.113), CoEdPilot (2024), and DSPy — all contemporary systems.",
    192           "source": "opus"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "The comparison of Original vs w/ EditFlow for each baseline system (Tables 5-6) serves as an ablation showing the contribution of the flow-aware optimization component.",
    198           "source": "opus"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Multiple metric categories are used: flow categories (Keep/Jump/Revert/Break), precision/recall/F0.5, and resource usage metrics (latency, tokens, cost).",
    204           "source": "opus"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "RQ4 (Section 7.4) presents a user study with 32 developers evaluating real-world task completion and perceived recommendation quality.",
    210           "source": "opus"
    211         },
    212         "held_out_test_set": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Section 7.1.2: 'we split the dataset at the commit level in a 7:3 ratio, ensuring that all samples from the same commit are assigned to the same split' — preventing intra-commit data leakage.",
    216           "source": "opus"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Results are broken down per baseline system (Cursor, Claude Code, CoEdPilot), per task in the user study (Tasks 1-3), and per flow category (Keep/Jump/Revert/Break).",
    222           "source": "opus"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Section 8 provides a detailed failure analysis with two specific failure modes (false rejection due to k-context sensitivity and acceptance of incorrect flow-keeping edits), including concrete examples.",
    228           "source": "opus"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "The paper reports that EditFlow does not show statistically significant improvement on Tasks 1 and 3 in the user study, and discusses why (Section 7.4.6). Recall decreases by 7.09% on average.",
    234           "source": "opus"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Section 7.1.1 specifies 'Claude-Sonnet-4-20250514' with exact snapshot date. Section 7.3.3 gives exact versions: 'Claude Code (Version 1.0.113), Cursor CLI (Version 2025.09.18-7ae6800)'.",
    242           "source": "opus"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "The auto-tuned prompt is central to the method but is not included in the paper. The paper says 'For the detailed learned prompt, please refer to our anonymous website [3]' (Section 6.1). The prompt itself is not in the paper or appendix.",
    248           "source": "opus"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Section 7.1.1: 'maximum output length of 4096 tokens and a temperature of 0.7. The auto-tuning underwent 5 epochs and a batch size of 32.'",
    254           "source": "opus"
    255         },
    256         "scaffolding_described": {
    257           "applies": true,
    258           "answer": false,
    259           "justification": "The paper evaluates Cursor CLI and Claude Code as black boxes ('relied on their default underlying models without manually specifying a particular model'), so the scaffolding of these underlying systems is not described; per schema rules, the criterion would be NA for the third-party tools on their own. EditFlow's own scaffolding (the filter and re-rank wrapper) is described, but the digital twin integration approach is only briefly described, so the criterion is marked as not met.",
    260           "source": "opus"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "Section 7.3.2 documents commit selection criteria: '(1) containing 5-10 edit hunks across at least 2 source files; (2) involving real user authorship; (3) excluding merge commits and filename changes; (4) maintaining ASCII-only content with meaningful code modifications.'",
    266           "source": "opus"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "The annotated dataset and experiment results are stated as available at the anonymous website. The industrial dataset cannot be released.",
    274           "source": "opus"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "Section 7.1.2 describes the annotated dataset: '100 commits from the 45 most-starred open-source GitHub Python repositories, comprising 772 edit hunks and 1,747 directed edges.' Section 7.2.2 describes the industrial dataset: '500 commits from Jun. 2025 to Aug. 2025, containing 3,059 edit hunks.'",
    280           "source": "opus"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "Section 7.4.2: 'We recruited 32 participants from 2 universities' with demographics (age 20-30, CS students from undergraduate to PhD, 4.5 days/week coding, 90% prior AI tool experience). Footnote 3 describes employee consent and anonymization for industrial data.",
    286           "source": "opus"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "The annotation pipeline is described in Section 6.1: independent annotation by two authors, consensus resolution, 20 minutes per commit, 77 person-hours total. The data split is documented (7:3 at commit level).",
    292           "source": "opus"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "The paper uses Claude-Sonnet-4-20250514 and Cursor/Claude Code for evaluation but does not state the training data cutoff dates for these models.",
    300           "source": "opus"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "The paper does not discuss whether the GitHub commits used in benchmarks could have appeared in the training data of the LLMs used for order recovery or evaluation.",
    306           "source": "opus"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": false,
    311           "justification": "The benchmark uses commits from popular GitHub repositories (most-starred Python repos). These are highly likely to be in training data for Claude and other models, but contamination risk is not discussed.",
    312           "source": "opus"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": true,
    318           "answer": false,
    319           "justification": "No pre-registration mentioned for the user study with 32 participants.",
    320           "source": "opus"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": true,
    324           "answer": false,
    325           "justification": "No IRB or ethics board approval is mentioned for the user study. Footnote 3 mentions employee consent for industrial data but not ethics approval for the user study.",
    326           "source": "opus"
    327         },
    328         "demographics_reported": {
    329           "applies": true,
    330           "answer": true,
    331           "justification": "Section 7.4.2: participants aged 20-30, CS students (undergrad to PhD), coding 4.5 days/week on average, 90% with prior AI tool experience.",
    332           "source": "opus"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": true,
    336           "answer": true,
    337           "justification": "Section 7.4.2: 'All participants are required to complete a pre-study questionnaire to collect their background information, including educational level, programming proficiency, and prior experience with AI-assisted programming tools.'",
    338           "source": "opus"
    339         },
    340         "randomization_described": {
    341           "applies": true,
    342           "answer": false,
    343           "justification": "The paper describes four groups (CG1, EG1, CG2, EG2) but does not explain how participants were assigned to groups (randomization procedure not described).",
    344           "source": "opus"
    345         },
    346         "blinding_described": {
    347           "applies": true,
    348           "answer": false,
    349           "justification": "No mention of whether participants knew which condition they were in (with or without EditFlow). Blinding is not discussed.",
    350           "source": "opus"
    351         },
    352         "attrition_reported": {
    353           "applies": true,
    354           "answer": true,
    355           "justification": "Table 7 shows results for all 32 participants (P1-P32), 8 per group, with no apparent dropouts. All participants completed all 3 tasks.",
    356           "source": "opus"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": true,
    363           "justification": "Tables 5-6 report per-query resource usage including latency (seconds), token usage (K), and monetary cost ($) for each system with and without EditFlow.",
    364           "source": "opus"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "No total computational budget (GPU hours, total API spend, total experiment cost) is reported. Only per-query costs are shown.",
    370           "source": "opus"
    371         }
    372       },
    373       "experimental_rigor": {
    374         "seed_sensitivity_reported": {
    375           "applies": true,
    376           "answer": false,
    377           "justification": "No discussion of random seed sensitivity. Section 9 acknowledges LLM stochasticity but does not report results across multiple seeds.",
    378           "source": "opus"
    379         },
    380         "number_of_runs_stated": {
    381           "applies": true,
    382           "answer": false,
    383           "justification": "The paper does not state how many times each experiment was run. The digital twin simulation appears to be single-run.",
    384           "source": "opus"
    385         },
    386         "hyperparameter_search_budget": {
    387           "applies": true,
    388           "answer": false,
    389           "justification": "The prompt auto-tuning uses 5 epochs but the total search budget (number of candidate prompts evaluated, compute cost) is not reported.",
    390           "source": "opus"
    391         },
    392         "best_config_selection_justified": {
    393           "applies": true,
    394           "answer": true,
    395           "justification": "Algorithm 1 describes the prompt selection procedure: accuracy on the full training set is used to select the best prompt at each epoch, which is a clearly defined selection criterion.",
    396           "source": "opus"
    397         },
    398         "multiple_comparison_correction": {
    399           "applies": true,
    400           "answer": false,
    401           "justification": "The user study performs multiple statistical comparisons (3 tasks × 2 system pairs = 6 tests) but no multiple comparison correction (Bonferroni, etc.) is applied.",
    402           "source": "opus"
    403         },
    404         "self_comparison_bias_addressed": {
    405           "applies": true,
    406           "answer": false,
    407           "justification": "The authors evaluate their own EditFlow system against baselines without acknowledging author-evaluation bias.",
    408           "source": "opus"
    409         },
    410         "compute_budget_vs_performance": {
    411           "applies": true,
    412           "answer": true,
    413           "justification": "Tables 5-6 report resource usage (latency, tokens, cost) alongside performance for each system, allowing compute-performance comparison. The paper discusses the additional overhead of EditFlow (1.71s latency, 6.58K tokens, $0.03 per query).",
    414           "source": "opus"
    415         },
    416         "benchmark_construct_validity": {
    417           "applies": true,
    418           "answer": true,
    419           "justification": "Section 9 (Construct validity) explicitly discusses: 'The notion of mental-flow alignment is inherently abstract, and our operationalization through the Keep/Jump/Revert/Break taxonomy may only approximate developers' cognitive states.'",
    420           "source": "opus"
    421         },
    422         "scaffold_confound_addressed": {
    423           "applies": false,
    424           "answer": false,
    425           "justification": "The paper evaluates Cursor, Claude Code, and CoEdPilot as bundled products/tools. The scaffold IS the thing being tested, so this criterion does not apply.",
    426           "source": "opus"
    427         }
    428       },
    429       "data_leakage": {
    430         "temporal_leakage_addressed": {
    431           "applies": true,
    432           "answer": false,
    433           "justification": "No discussion of whether the benchmark commits existed before the LLM training cutoffs. Top-starred GitHub repos are very likely in training data.",
    434           "source": "opus"
    435         },
    436         "feature_leakage_addressed": {
    437           "applies": true,
    438           "answer": false,
    439           "justification": "The digital twin provides commit messages as edit descriptions to the SUTs, which could leak information about the expected edits. This is not discussed.",
    440           "source": "opus"
    441         },
    442         "non_independence_addressed": {
    443           "applies": true,
    444           "answer": true,
    445           "justification": "Section 7.1.2: 'To avoid intra-commit data leakage, we split the dataset at the commit level in a 7:3 ratio, ensuring that all samples from the same commit are assigned to the same split.'",
    446           "source": "opus"
    447         },
    448         "leakage_detection_method": {
    449           "applies": true,
    450           "answer": false,
    451           "justification": "No concrete leakage detection or prevention method is used beyond the commit-level split. No canary strings, membership inference, or decontamination.",
    452           "source": "opus"
    453         }
    454       }
    455     }
    456   },
    457   "claims": [
    458     {
    459       "claim": "68.81% of AI code edit recommendations disrupt developer mental flow, including 8.83% that are technically correct but ill-timed",
    460       "evidence": "Analysis of 50 real-world commits using Cursor and Claude Code via the digital twin framework (Section 5, Table 1)",
    461       "supported": "moderate"
    462     },
    463     {
    464       "claim": "The auto-tuned prompt achieves 87.26% accuracy on edit order recovery, a 63.81% relative improvement over the best baseline (DSPy at 53.39%)",
    465       "evidence": "Table 3 reports accuracy, precision, and F1 across five methods on 871 test samples",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "EditFlow reduces flow violations by over 75% compared to the best baseline on a real-world industrial dataset",
    470       "evidence": "Table 4 shows auto-tuned prompt produces 30 violations vs. 121 for the hand-crafted baseline on 500 industrial commits",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "EditFlow improves recommendation precision by an average of 66.99% across systems and benchmarks",
    475       "evidence": "Tables 5 and 6 compare each system's precision with and without EditFlow, showing improvements for Cursor (28.5%), Claude Code (24.4%), and CoEdPilot (240%)",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "EditFlow leads to 25.11% faster task completion in a controlled user study with 32 developers",
    480       "evidence": "Table 7 and Section 7.4.6 report task times; statistical significance varies by task (significant for Task 2, not significant for Tasks 1 and 3)",
    481       "supported": "moderate"
    482     },
    483     {
    484       "claim": "EditFlow introduces acceptable overhead of 1.71s latency, 6.58K tokens, and $0.03 per query",
    485       "evidence": "Resource usage columns in Tables 5 and 6, averaged across SUTs and benchmarks",
    486       "supported": "strong"
    487     },
    488     {
    489       "claim": "Flow-aware optimization benefits scale with task difficulty — easy/uniform tasks show no significant improvement",
    490       "evidence": "Tasks 1 and 3 show p>0.19 for Claude Code comparisons; Task 2 (hard, ambiguous) shows p=0.0004 with large effect sizes",
    491       "supported": "strong"
    492     }
    493   ],
    494   "methodology_tags": [
    495     "benchmark-eval",
    496     "observational",
    497     "qualitative"
    498   ],
    499   "key_findings": "EditFlow demonstrates that existing AI coding assistants (Cursor, Claude Code) violate developer mental flow in 65–72% of recommendations by optimizing for end-state correctness rather than incremental edit order. A prompt auto-tuning approach achieves 87.26% accuracy in recovering cognitive edit order — 63.81% better than strong baselines including DSPy — enabling a unified post-processing wrapper that improves recommendation precision by 66.99% and reduces flow violations by 75%. A controlled user study with 32 participants confirms 25.11% faster task completion, with benefits concentrated on moderately difficult tasks where cognitive sequencing matters and existing tools produce confusing suggestions. Flow-aware optimization provides little benefit for simple uniform refactoring tasks where any edit order is cognitively valid.",
    500   "red_flags": [
    501     {
    502       "flag": "Circular ground truth in large-scale benchmark",
    503       "detail": "RQ3 uses LLM-inferred partial order graphs (from the same model being evaluated) as ground truth for computing flow-aware metrics; while a human-annotated benchmark is included as a cross-check, the large-scale 500-commit benchmark relies entirely on LLM-derived labels as ground truth."
    504     },
    505     {
    506       "flag": "No significance tests for main benchmark results",
    507       "detail": "The headline claims of 63.81% accuracy improvement (RQ1), 75% violation reduction (RQ2), and 66.99% precision improvement (RQ3) have no p-values or confidence intervals despite being comparative claims."
    508     },
    509     {
    510       "flag": "User study groups of 8 participants each",
    511       "detail": "Splitting 32 participants across 4 groups (EG1, CG1, EG2, CG2) yields only 8 per group, far too few for robust subgroup analysis; several task comparisons are non-significant."
    512     },
    513     {
    514       "flag": "Benchmark contamination unaddressed",
    515       "detail": "The test benchmark draws from 45–80 most-starred public GitHub Python repositories that almost certainly appear in Claude Sonnet's training data; the paper neither acknowledges nor investigates this potential contamination."
    516     },
    517     {
    518       "flag": "No IRB or ethics approval for human study",
    519       "detail": "The 32-participant controlled study involves task-based evaluation with screen recording but no IRB approval or ethics review is mentioned."
    520     },
    521     {
    522       "flag": "No blinding or randomization in user study",
    523       "detail": "Participants were assigned to groups without described randomization and were not blinded to their condition (they used the EditFlow VS Code extension explicitly), introducing potential Hawthorne and demand effects."
    524     },
    525     {
    526       "flag": "Key artifact (auto-tuned prompt) not in paper",
    527       "detail": "The primary learned artifact driving all results is hosted at an anonymous website rather than included in the paper, meaning reproducibility depends on external link availability."
    528     }
    529   ],
    530   "cited_papers": [
    531     {
    532       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    533       "relevance": "Core motivation: Becker et al. 2025 RCT showing 19% slower task completion with Cursor+Claude, directly motivating the EditFlow investigation"
    534     },
    535     {
    536       "title": "CoEdPilot: Recommending Code Edits with Learned Prior Edit Relevance, Project-wise Awareness, and Interactive Nature",
    537       "relevance": "Primary academic baseline system for edit recommendation; same research group as EditFlow"
    538     },
    539     {
    540       "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
    541       "relevance": "Prompt optimization baseline directly compared against the auto-tuning approach"
    542     },
    543     {
    544       "title": "The SPACE of Developer Productivity: There's more to it than you think",
    545       "relevance": "Framework situating mental flow as a first-class productivity dimension, supporting the paper's theoretical framing"
    546     },
    547     {
    548       "title": "The cost of interrupted work: more speed and stress",
    549       "relevance": "Quantifies 23-minute recovery tax after interruptions, motivating flow-preservation as a productivity lever"
    550     },
    551     {
    552       "title": "DevEX: What actually drives productivity?",
    553       "relevance": "Industry framework treating flow state as one of three core developer productivity dimensions"
    554     },
    555     {
    556       "title": "CodePlan: Repository-level coding using LLMs and planning",
    557       "relevance": "Related work on LLM-based code editing with dependency graphs, contrasted with EditFlow's flow-aware approach"
    558     },
    559     {
    560       "title": "'It's weird that it knows what I want': Usability and interactions with Copilot for novice programmers",
    561       "relevance": "Documents flow disruption from unwanted AI suggestions and cognitive load from constant verification"
    562     }
    563   ],
    564   "engagement_factors": {
    565     "practical_relevance": {
    566       "score": 3,
    567       "justification": "Releases a VS Code extension wrapping Cursor and Claude Code; directly actionable by any developer using these tools today."
    568     },
    569     "surprise_contrarian": {
    570       "score": 2,
    571       "justification": "Challenges the assumption that more accurate AI suggestions improve productivity; anchors on the Becker et al. finding that developers are 19% slower with AI assistance."
    572     },
    573     "fear_safety": {
    574       "score": 0,
    575       "justification": "No AI safety or risk concerns raised; paper is about developer UX optimization."
    576     },
    577     "drama_conflict": {
    578       "score": 1,
    579       "justification": "Implicitly critiques Cursor and Claude Code as productivity-degrading; references a controlled trial showing that AI-assisted developers are slower."
    580     },
    581     "demo_ability": {
    582       "score": 3,
    583       "justification": "VS Code extension with demo videos available at project homepage; users can install and try it with their existing Cursor or Claude Code setup."
    584     },
    585     "brand_recognition": {
    586       "score": 2,
    587       "justification": "Evaluates Cursor and Claude Code (high brand recognition tools); published at OOPSLA (top venue); authors from NUS and SJTU (reputable institutions)."
    588     }
    589   },
    590   "hn_data": {
    591     "threads": [],
    592     "top_points": 0,
    593     "total_points": 0,
    594     "total_comments": 0
    595   }
    596 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs