scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28225B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "EditFlow: Benchmarking and Optimizing Code Edit Recommendation Systems via Reconstruction of Developer Flows",
      6     "authors": [
      7       "Chenyan Liu",
      8       "Yun Lin",
      9       "Jiaxin Chang",
     10       "Jiawei Liu",
     11       "Binhang Qi",
     12       "Bo Jiang",
     13       "Zhiyong Huang",
     14       "Jin Song Dong"
     15     ],
     16     "year": 2026,
     17     "venue": "Proc. ACM Program. Lang. (OOPSLA)",
     18     "arxiv_id": "2602.21697",
     19     "doi": "10.1145/3798249"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "All major abstract claims (63.81% accuracy improvement, 75% violation reduction, 66.99% precision boost, 25.11% task speedup) are traced to specific tables (Tables 3–7) and experiments in the paper.",
     27         "source": "haiku"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Causal claims are supported by controlled digital twin simulation (identical commit inputs, original vs. w/EditFlow configurations) and by a controlled user study comparing treatment vs. control groups across 3 tasks, the latter backed by statistical tests.",
     33         "source": "haiku"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 9 (Threats to Validity) explicitly bounds external validity to Python commits on GitHub repositories, acknowledging that generalization to other languages and workflows requires further work.",
     39         "source": "haiku"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper discusses failure modes but does not consider alternative explanations for EditFlow's benefit (e.g., simple suggestion reduction reducing cognitive overload regardless of flow reasoning, or selection bias in the annotated dataset).",
     45         "source": "haiku"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper distinguishes flow-aware metrics (Keep/Jump/Revert/Break), flow-independent metrics (Precision/Recall/F0.5), resource metrics, and user study task-completion time as separate measurement levels aligned to different claims.",
     51         "source": "haiku"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 9 is a dedicated Threats to Validity section covering external, construct, and internal validity in detail.",
     59         "source": "haiku"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Specific threats named include: Python-only benchmark, single observed trajectory per commit biasing violation metrics, 1-context sensitivity in EditFlow filtering, LLM stochasticity in order inference, and digital twin's assumption that developers always make correct decisions.",
     65         "source": "haiku"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper explicitly states scope is limited to Python, GitHub-sourced commits, and that the industrial dataset cannot be released; it notes findings may not generalize to other languages or non-GitHub workflows.",
     71         "source": "haiku"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Acknowledgments list specific grants: NSFC (62572300), Singapore MOE (MOE-T2EP20124-0017, MOET32020-0004), NRF, DSO National Laboratories (AISG2-GC-2023-008-1B), and Cyber Security Agency of Singapore.",
     79         "source": "haiku"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "All author affiliations are disclosed in the header: Shanghai Jiao Tong University, National University of Singapore, and Bytedance Network Technology for co-author Bo Jiang.",
     85         "source": "haiku"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "All listed funders are government agencies (China NSFC, Singapore MOE/NRF/DSO) independent of the code editing tools evaluated (Cursor, Claude Code, CoEdPilot); Bytedance is an author affiliation, not a funder.",
     91         "source": "haiku"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests or financial interests statement is provided; Bytedance co-author Bo Jiang's potential interest in AI coding tools is not disclosed.",
     97         "source": "haiku"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Key terms are formally defined: Edit Hunk (Def. 1), Pairwise Edit Order with labels {≺, ≻, ∼, ⊥} (Def. 2), Mental Flow Graph (Def. 3), One-Hop Successor (Def. 4), and mental flow (cited from Csikszentmihalyi 1990).",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Five explicit contributions are enumerated in Section 1: the mental flow framing, prompt auto-tuning strategy, digital twin evaluation framework, empirical demonstration, and VS Code extension implementation.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 10 situates EditFlow against static analysis methods (CCDemon, Overwatch, Pyevolve), LLM-based editors (CoditT5, GrACE, SARGAM, CoEdPilot), and developer productivity frameworks (SPACE, DevEx), showing explicit differentiation.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "The Data-Availability Statement and repeated references to [3] confirm source code, auto-tuned prompt, dataset, and results are available at sites.google.com/view/editflow (not 'upon request').",
    128           "source": "haiku"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "The annotated dataset (100 commits, 2,030 training + 871 test samples) is available at the anonymous website; the industrial dataset is explicitly withheld due to compliance restrictions.",
    134           "source": "haiku"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Model version (Claude-Sonnet-4-20250514) and hyperparameters are specified but no requirements.txt, Dockerfile, or dependency list is provided for reproducing the experimental environment.",
    140           "source": "haiku"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "The paper refers readers to the anonymous website for the learned prompt and algorithms but provides no step-by-step instructions within the paper sufficient to reproduce experiments without guessing.",
    146           "source": "haiku"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Tables 3–6 report point estimates only; the user study (Table 7) reports p-values and effect sizes, but no confidence intervals or error bars appear anywhere in the paper.",
    154           "source": "haiku"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "Statistical tests (Mann-Whitney U with permutation testing) are used only for the user study (RQ4); the main technical evaluations in RQ1–RQ3 make comparative claims without any significance testing.",
    160           "source": "haiku"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Effect sizes (r derived from Mann-Whitney U statistic) are reported for all user study comparisons; percentage improvements with baseline context are reported for RQ1–RQ3.",
    166           "source": "haiku"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The 32-participant user study and 100-commit annotated dataset are not justified with power analysis or sample size rationale; 8 participants per group is not defended.",
    172           "source": "haiku"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "Tables 3–6 report means with no standard deviation or variance; Table 7 shows individual times enabling variance computation but the paper only reports group averages.",
    178           "source": "haiku"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "RQ1 compares against zero-shot, few-shot, hand-crafted prompt, and DSPy; RQ3 compares Cursor/Claude Code/CoEdPilot original vs. w/EditFlow on two benchmarks.",
    186           "source": "haiku"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Baselines include DSPy (2024), Claude Code (v1.0.113), Cursor CLI (2025.09.18), and CoEdPilot (ISSTA 2024), all contemporary and representative systems.",
    192           "source": "haiku"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": false,
    197           "justification": "The paper compares original vs. w/EditFlow end-to-end but does not ablate individual EditFlow components (prompt auto-tuning alone, filtering alone, re-ranking alone) to isolate their contributions.",
    198           "source": "haiku"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Four metric categories are used: flow-aware (Keep/Jump/Revert/Break), flow-independent (Precision/Recall/F0.5), resource usage (latency/tokens/cost), and user study (task completion time, Mann-Whitney statistics).",
    204           "source": "haiku"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "RQ4 is a controlled user study with 32 participants completing 3 real-world editing tasks, measuring task completion time and perceived recommendation quality.",
    210           "source": "haiku"
    211         },
    212         "held_out_test_set": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "For RQ1, the annotated dataset is split 7:3 at the commit level to prevent intra-commit data leakage, with 871 held-out test samples used for final evaluation.",
    216           "source": "haiku"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Results are broken down by system (Cursor, Claude Code, CoEdPilot) in Tables 5–6 and by task (T1, T2, T3) in Table 7, with per-participant breakdowns provided.",
    222           "source": "haiku"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Section 8 is entirely devoted to failure analysis, presenting two concrete failure modes (false rejection due to k-context sensitivity, false acceptance of locally coherent but incorrect edits) with specific examples and metric implications.",
    228           "source": "haiku"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "Task 1 (p=0.1966) and Task 3 (p=0.2186) show no statistically significant improvement from EditFlow; the paper analyzes why for each task rather than dismissing the null results.",
    234           "source": "haiku"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Exact versions are given: Claude-Sonnet-4-20250514 for prompt tuning, Claude Code Version 1.0.113, Cursor CLI Version 2025.09.18-7ae6800.",
    242           "source": "haiku"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "The auto-tuned prompt is available at the anonymous website [3] (sites.google.com/view/editflow); an example edit hunk representation (Table 2) and algorithm pseudocode (Algorithm 1) are provided in the paper.",
    248           "source": "haiku"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Temperature (0.7), max output length (4096), number of epochs (5), and batch size (32) are all reported for the prompt auto-tuning experiment in Section 7.1.1.",
    254           "source": "haiku"
    255         },
    256         "scaffolding_described": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Section 7.3.3 describes how the digital twin interacts with each system: Claude Code SDK and Cursor CLI in headless mode, specific formatting for CoEdPilot's discriminator/locator/generator pipeline.",
    260           "source": "haiku"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "Commit selection criteria are enumerated (5–10 hunks, ≥2 source files, ASCII-only, no merge commits, no filename changes); annotation process documented (2 independent annotators, 20 min/commit, consensus resolution, 77 person-hours).",
    266           "source": "haiku"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "The annotated dataset (commits, edit hunks, pairwise labels) is available at the anonymous website; the industrial dataset is withheld for compliance reasons, noted explicitly.",
    274           "source": "haiku"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "Data collection is documented for the annotated set (100 commits from 45 most-starred Python repos, pairwise annotation with inter-annotator agreement) and industrial set (500 commits Jun–Aug 2025, employee consent, anonymization, secure environment).",
    280           "source": "haiku"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": true,
    284           "answer": false,
    285           "justification": "Participants are described as recruited from 2 universities but no recruitment method (posting, course credit, snowball, etc.), compensation, or selection process is stated.",
    286           "source": "haiku"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "The pipeline from GitHub commit selection → edit hunk extraction → pairwise annotation → train/test split → prompt optimization → digital twin evaluation is described step-by-step across Sections 6–7.",
    292           "source": "haiku"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "Claude-Sonnet-4-20250514's training cutoff is not stated; the paper uses the model for both prompt optimization and evaluation on GitHub commits without addressing whether those commits predate the cutoff.",
    300           "source": "haiku"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "The paper splits commits to avoid intra-dataset leakage but does not discuss whether the GitHub repository commits used for benchmarking were included in Claude's pretraining data.",
    306           "source": "haiku"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": false,
    311           "justification": "The benchmark draws from top-starred GitHub Python repositories (e.g., kovidgoyal/kitty, getsentry/sentry) that are almost certainly in Claude's training corpus; this is not acknowledged.",
    312           "source": "haiku"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": true,
    318           "answer": false,
    319           "justification": "No pre-registration is mentioned for the 32-participant user study.",
    320           "source": "haiku"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": true,
    324           "answer": false,
    325           "justification": "No IRB or ethics approval is mentioned despite conducting a human subjects study at two universities.",
    326           "source": "haiku"
    327         },
    328         "demographics_reported": {
    329           "applies": true,
    330           "answer": true,
    331           "justification": "Age range (20–30), educational level (undergraduate to PhD in CS), programming frequency (4.5 days/week), and prior AI tool experience (90%) are reported in Section 7.4.2.",
    332           "source": "haiku"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": true,
    336           "answer": false,
    337           "justification": "Requirements include CS enrollment and completing a pre-study questionnaire, but no formal inclusion/exclusion criteria are stated (e.g., minimum Python experience threshold, familiarity with the tools).",
    338           "source": "haiku"
    339         },
    340         "randomization_described": {
    341           "applies": true,
    342           "answer": false,
    343           "justification": "The assignment of participants to the four groups (CG1/EG1/CG2/EG2) is never described; it is unknown whether random assignment was used.",
    344           "source": "haiku"
    345         },
    346         "blinding_described": {
    347           "applies": true,
    348           "answer": false,
    349           "justification": "No blinding is described; participants clearly know whether they are using EditFlow-wrapped or original systems given the VS Code extension interface.",
    350           "source": "haiku"
    351         },
    352         "attrition_reported": {
    353           "applies": true,
    354           "answer": true,
    355           "justification": "Table 7 shows all 32 participants (P1–P32) with complete data for all 3 tasks, implying no attrition, though dropout is not explicitly addressed.",
    356           "source": "haiku"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": true,
    363           "justification": "Tables 5 and 6 report per-query latency (seconds), token usage (K), and monetary cost ($) for each system configuration including the EditFlow overhead.",
    364           "source": "haiku"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "Per-query costs are reported but total compute budget for running the full set of experiments (500 commits × multiple systems × multiple RQs) is not stated.",
    370           "source": "haiku"
    371         }
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "68.81% of AI code edit recommendations from Cursor and Claude Code disrupt developers' mental flow",
    378       "evidence": "Empirical study on 50 manually annotated Python commits using the digital twin framework (Section 5, Table 1): Keep edits are 28.23% (Cursor) and 34.16% (Claude Code), with Break edits dominant at 55.48% and 51.23% respectively",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Auto-tuned prompt achieves 63.81% relative improvement in edit order recovery accuracy over best baseline",
    383       "evidence": "Table 3: auto-tuned prompt achieves 87.26% accuracy vs. DSPy's 53.39% best baseline (63.81% relative improvement); consistent across precision (88.01%) and F1 (87.54%)",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "EditFlow reduces flow violations on real-world industrial data by over 75% compared to best baseline",
    388       "evidence": "Table 4: auto-tuned prompt yields 30 violations vs. 121 for hand-crafted prompt (best baseline) on 500 industrial commits from a 60K+ employee IT company",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "EditFlow improves edit recommendation precision by 66.99% on average across systems and benchmarks",
    393       "evidence": "Tables 5–6: Cursor improves from 33.02%→42.42% and 44.05%→53.53%; Claude Code from 40.54%→50.45% and 39.68%→48.96%; CoEdPilot from 14.78%→35.50% and 10.00%→26.39%",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "EditFlow leads to 25.11% faster task completion in a controlled user study with 32 developers",
    398       "evidence": "Table 7: aggregate average across groups and tasks; statistically significant on Task 2 (p=0.0004, r=0.788 for EG1 vs CG1; p=0.0004, r=0.840 for EG2 vs CG2) but not Task 1 or Task 3",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "EditFlow is effective specifically for complex tasks requiring deep codebase understanding, not simple refactoring",
    403       "evidence": "Task 2 (hard, cross-file cascading change): strong significant improvement; Task 3 (uniform refactoring): no significant improvement (p=0.2186); explicitly analyzed as boundary conditions in Section 7.4.6",
    404       "supported": "strong"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval",
    409     "case-study",
    410     "observational"
    411   ],
    412   "key_findings": "EditFlow addresses the disconnect between AI code editing accuracy and developer productivity by framing the problem as mental flow alignment. A prompt auto-tuning strategy achieves 87.26% accuracy in recovering pairwise edit orders, a 63.81% relative improvement over the best of the zero-shot, few-shot, hand-crafted, and DSPy baselines. Wrapping existing AI coding assistants (Cursor, Claude Code, CoEdPilot) with EditFlow's flow-aware filter improves recommendation precision by 66.99% on average and reduces flow violations by 75%+ on industrial data. A 32-participant user study confirms 25.11% faster task completion, with strongest gains on complex multi-file tasks and no significant benefit on uniform refactoring tasks where any edit order is cognitively valid.",
    413   "red_flags": [
    414     {
    415       "flag": "Tiny user study groups",
    416       "detail": "8 participants per group (4 groups, 32 total) is far too small for reliable subgroup analysis or generalization; the overall 25.11% speedup conflates results from asymmetric task difficulties and heterogeneous systems."
    417     },
    418     {
    419       "flag": "Randomization not described",
    420       "detail": "Section 7.4.2 does not describe how participants were assigned to the four groups (CG1/EG1/CG2/EG2), making it impossible to assess selection bias."
    421     },
    422     {
    423       "flag": "No IRB or ethics disclosure",
    424       "detail": "A human subjects study at two universities with screen recordings and interaction logging is conducted without any mention of ethics review or participant consent beyond the industrial data note."
    425     },
    426     {
    427       "flag": "Benchmark contamination unaddressed",
    428       "detail": "The benchmark uses top-starred GitHub Python repos (kovidgoyal/kitty, getsentry/sentry, etc.) almost certainly included in Claude's pretraining corpus; Claude is also used to infer ground-truth edit orders for evaluation."
    429     },
    430     {
    431       "flag": "No confidence intervals or significance tests for main results",
    432       "detail": "Tables 3–6 report point estimates only; the large precision improvements (e.g., CoEdPilot: 14.78%→35.50%) have no variance, CI, or statistical test, making their reliability unassessable."
    433     },
    434     {
    435       "flag": "No component ablation",
    436       "detail": "EditFlow has three interacting components (prompt auto-tuning, digital twin, flow-aware filtering); the paper only evaluates the full system vs. original, making it impossible to attribute gains to individual components."
    437     },
    438     {
    439       "flag": "Digital twin as ground truth",
    440       "detail": "For RQ3, the auto-tuned prompt itself generates the flow graph used as ground truth for evaluation, creating circularity: EditFlow uses the same prompt for filtering that defines what 'Keep' means in the evaluation metrics."
    441     }
    442   ],
    443   "cited_papers": [
    444     {
    445       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    446       "relevance": "Key citation establishing the 19% productivity slowdown with AI assistance that motivates the entire EditFlow framing; RCT by Becker et al. 2025"
    447     },
    448     {
    449       "title": "CoEdPilot: Recommending Code Edits with Learned Prior Edit Relevance, Project-wise Awareness, and Interactive Nature",
    450       "relevance": "Primary academic baseline system for subsequent edit recommendation; prior work by the same first author (Liu et al. 2024, ISSTA)"
    451     },
    452     {
    453       "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
    454       "relevance": "Baseline prompt optimization framework compared against EditFlow's auto-tuning approach"
    455     },
    456     {
    457       "title": "The SPACE of Developer Productivity: There's more to it than you think",
    458       "relevance": "Industry-standard framework positioning Flow as one of five dimensions of developer productivity, used to ground the mental flow construct"
    459     },
    460     {
    461       "title": "DevEX: What actually drives productivity?",
    462       "relevance": "Framework identifying flow state as a core driver of developer productivity, used alongside SPACE to justify the mental flow framing"
    463     },
    464     {
    465       "title": "The cost of interrupted work: more speed and stress",
    466       "relevance": "Empirical evidence that interruptions require 23 minutes 15 seconds recovery time, providing quantitative grounding for why flow disruptions matter"
    467     },
    468     {
    469       "title": "Grace: Language Models Meet Code Edits",
    470       "relevance": "Prior work on incorporating prior edits as context for code edit recommendation, directly related to EditFlow's problem space"
    471     },
    472     {
    473       "title": "CodePlan: Repository-level coding using LLMs and planning",
    474       "relevance": "Related approach using LLMs with static analysis for reasoning over code changes across files"
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 3,
    480       "justification": "Implements an actual VS Code extension wrapping Cursor, Claude Code, and CoEdPilot; addresses the real gap between benchmark accuracy and developer productivity that affects everyday coding tool use."
    481     },
    482     "surprise_contrarian": {
    483       "score": 3,
    484       "justification": "Directly challenges the assumption that higher benchmark accuracy leads to better developer outcomes, citing a controlled trial showing 19% slowdown; reframes the problem from accuracy to cognitive flow."
    485     },
    486     "fear_safety": {
    487       "score": 0,
    488       "justification": "No AI safety or risk concerns raised; the paper is focused on productivity optimization."
    489     },
    490     "drama_conflict": {
    491       "score": 2,
    492       "justification": "Explicitly demonstrates that Cursor and Claude Code—the dominant commercial tools—disrupt mental flow in the majority of recommendations, which challenges their marketing claims."
    493     },
    494     "demo_ability": {
    495       "score": 3,
    496       "justification": "VS Code extension is implemented and available with demonstration videos at the anonymous website; practitioners can immediately install and try EditFlow with their existing Cursor or Claude Code setup."
    497     },
    498     "brand_recognition": {
    499       "score": 2,
    500       "justification": "Directly evaluates Claude Code and Cursor (the two most prominent AI coding tools in 2025–2026) as the systems under test, lending immediate relevance to the broader developer community."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [],
    505     "top_points": 0,
    506     "total_points": 0,
    507     "total_comments": 0
    508   }
    509 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs