scan.json (30523B)
1 { 2 "paper": { 3 "title": "HAI-Eval: Measuring Human-AI Synergy in Collaborative Coding", 4 "authors": ["Hanjun Luo", "Chiming Ni", "Jiaheng Wen", "Zhimu Huang", "Yiran Wang", "Bingduo Liao", "Sylvia Chung", "Yingbin Jin", "Xinfeng Li", "Wenyuan Xu", "XiaoFeng Wang", "Hanan Salam"], 5 "year": 2025, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2512.04111", 8 "doi": "10.48550/arXiv.2512.04111" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "rct"], 13 "key_findings": "HAI-Eval introduces 'collaboration-necessary' coding tasks where standalone LLMs achieve near-zero pass rates (best pass@10: 4.22%) and unaided humans achieve only 18.89%, but human-AI collaboration raises performance to 31.11%. The within-subject study with 45 participants reveals that high-performing developers use AI not just for implementation but for strategic co-reasoning, with 51% adopting fundamentally different approaches suggested by AI. LLM performance is uniformly poor across all difficulty levels and professional tracks, suggesting the bottleneck is higher-order reasoning rather than algorithmic complexity.", 14 "claims": [ 15 { 16 "claim": "SOTA LLMs achieve near-zero pass rates on HAI-Eval, with the best model achieving only 4.22% overall pass@10", 17 "evidence": "Table 1 shows all five tested LLMs achieving 0-0.67% pass@1 under C0 and 0-2.89% under C1, with Claude-Sonnet-4 achieving best pass@10 of 4.22% under C1.", 18 "supported": "strong" 19 }, 20 { 21 "claim": "Human-AI collaboration significantly improves performance from 18.89% to 31.11% overall pass@1", 22 "evidence": "Table 2 shows CH=18.89% vs C2=31.11% averaged across difficulty levels. 
The paper states this is 'statistically significant' but does not report the specific test or p-value.", 23 "supported": "moderate" 24 }, 25 { 26 "claim": "LLMs are emerging as co-reasoning partners, not just implementation tools", 27 "evidence": "Based on self-reported participant feedback (Figure 4, Table 10): 80% used AI for brainstorming, 51% adopted a fundamentally different approach proposed by AI. Case study in Appendix L illustrates one example.", 28 "supported": "weak" 29 }, 30 { 31 "claim": "Collaborative performance remains stable across difficulty levels while standalone human performance degrades sharply", 32 "evidence": "Table 2 shows C2 overall pass@1: Easy 43.33%, Medium 26.67%, Hard 23.33% vs CH: Easy 36.67%, Medium 13.33%, Hard 6.67%.", 33 "supported": "moderate" 34 }, 35 { 36 "claim": "LLM failure is uniform across difficulty levels and professional tracks, suggesting a fundamental reasoning bottleneck", 37 "evidence": "Tables 6 and 7 in Appendix K show consistently near-zero pass rates across all difficulty levels and tracks for all models.", 38 "supported": "strong" 39 } 40 ], 41 "checklist": { 42 "artifacts": { 43 "code_released": { 44 "applies": true, 45 "answer": false, 46 "justification": "The abstract states 'Our benchmark and interactive demo will be openly accessible' — future tense with no URL provided. No repository link appears anywhere in the paper." 47 }, 48 "data_released": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper describes releasing '450 manually-curated static tasks' and states 'the dataset is released as a standalone resource,' but provides no download URL or repository link. The abstract uses future tense ('will be openly accessible')." 
52 }, 53 "environment_specified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper describes devcontainer files and Docker-based environments conceptually but provides no actual environment specification files, requirements.txt, or dependency versions for reproducing the evaluation setup." 57 }, 58 "reproduction_instructions": { 59 "applies": true, 60 "answer": false, 61 "justification": "No step-by-step reproduction instructions are provided. The paper describes the evaluation pipeline architecture but does not include commands or procedures to replicate the experiments." 62 } 63 }, 64 "statistical_methodology": { 65 "confidence_intervals_or_error_bars": { 66 "applies": true, 67 "answer": false, 68 "justification": "Main results (Tables 1, 2, 6, 7, 8) report only point estimates for pass rates, partial pass rates, completion times, and token usage. No confidence intervals or error bars on the primary performance metrics. SD is only reported for Likert-scale feedback items (Tables 3, 9)." 69 }, 70 "significance_tests": { 71 "applies": true, 72 "answer": false, 73 "justification": "The paper claims 'a statistically significant improvement' (Section 5.2) and states 'Statistical comparisons use appropriate tests with averaged results' (Section 5.1) but never specifies which tests were used, reports no p-values, and provides no test statistics." 74 }, 75 "effect_sizes_reported": { 76 "applies": true, 77 "answer": true, 78 "justification": "Results are reported with baseline context throughout: e.g., pass@1 rises from 18.89% (CH) to 31.11% (C2); Tables 1, 2, 6, 7 include absolute values and delta columns showing improvement magnitudes." 79 }, 80 "sample_size_justified": { 81 "applies": true, 82 "answer": false, 83 "justification": "The study uses 45 participants (15 per track) and 450 task instances for LLMs (10 per template). No power analysis or justification for why these specific sample sizes were chosen." 
84 }, 85 "variance_reported": { 86 "applies": true, 87 "answer": false, 88 "justification": "Main experimental results (pass rates across conditions) are reported as single aggregate numbers without standard deviations, interquartile ranges, or any spread measure. SD appears only for Likert-scale feedback in Tables 3 and 9." 89 } 90 }, 91 "evaluation_design": { 92 "baselines_included": { 93 "applies": true, 94 "answer": true, 95 "justification": "Three of the four conditions serve as baselines: CH (human-only), C0 (fully autonomous AI), and C1 (minimally-intervened AI), enabling comparison against C2 (human-AI collaboration)." 96 }, 97 "baselines_contemporary": { 98 "applies": true, 99 "answer": true, 100 "justification": "Five SOTA models evaluated as of July 2025: Claude-Sonnet-4, Claude-Sonnet-3.7, GPT-4.1, GPT-4o, and Gemini-2.5-Pro. All are current at time of study." 101 }, 102 "ablation_study": { 103 "applies": true, 104 "answer": true, 105 "justification": "The four conditions (CH, C0, C1, C2) function as an ablation: C0→C1 isolates procedural failures, CH→C2 measures AI contribution, C1→C2 measures human reasoning contribution." 106 }, 107 "multiple_metrics": { 108 "applies": true, 109 "answer": true, 110 "justification": "Four aggregated metrics are used: Overall Pass, Partial Pass, Completion Time, and Token Usage. Pass@1 and pass@10 are also reported for LLM conditions." 111 }, 112 "human_evaluation": { 113 "applies": true, 114 "answer": true, 115 "justification": "The entire study includes human evaluation: 45 participants solve tasks under different conditions. Additionally, two independent expert reviewers validated task instances (Appendix D), and results are validated through participant feedback and independent expert review." 116 }, 117 "held_out_test_set": { 118 "applies": true, 119 "answer": true, 120 "justification": "Hidden test cases are used for final scoring: 'Each task includes a set of visible unit tests to assist users during implementation. 
Final scoring, however, is determined by a comprehensive suite of hidden test cases executed on the backend' (Section 4.3)." 121 }, 122 "per_category_breakdown": { 123 "applies": true, 124 "answer": true, 125 "justification": "Results broken down by difficulty level (Easy/Medium/Hard in Table 2), by professional track (SDE/MLE/DS in Table 8), and by model in Tables 6-7." 126 }, 127 "failure_cases_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "GPT-4o's universal 0% pass rate is discussed. Template validation failures are reported (5 of 45 templates required revision or replacement). Appendix L provides a detailed case study of failure modes." 131 }, 132 "negative_results_reported": { 133 "applies": true, 134 "answer": true, 135 "justification": "GPT-4o achieves 0% pass rate across all conditions. Two templates were rejected during validation. The paper reports that minimal intervention (C1) provides only marginal improvement over fully autonomous (C0), showing the intervention protocol has limited impact." 136 } 137 }, 138 "claims_and_evidence": { 139 "abstract_claims_supported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Abstract claims (0.67% standalone LLM pass rate, 18.89% unaided human pass rate, 31.11% collaborative pass rate) are directly supported by Tables 1 and 2." 143 }, 144 "causal_claims_justified": { 145 "applies": true, 146 "answer": true, 147 "justification": "The paper claims 'human-AI collaboration significantly improves performance.' The within-subject design with full counterbalancing (Latin Square, randomized task sequences, 24-hour interval between sessions) provides adequate causal identification for this claim." 148 }, 149 "generalization_bounded": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix A explicitly lists limitations: Python only, Copilot-supported models only, East Asian university students only. 
Section 5.1 acknowledges 'generalizability to industry professionals and other ethnic groups is limited.' The title 'Collaborative Coding' is reasonably bounded." 153 }, 154 "alternative_explanations_discussed": { 155 "applies": true, 156 "answer": true, 157 "justification": "The paper discusses order effects (Part 3 of post-test questionnaire, Figure 11), cognitive fatigue mitigation (24-hour interval), and learning effects. They analyze whether improvement is due to tool familiarity vs genuine collaboration (finding expert users are minimally influenced by order)." 158 }, 159 "proxy_outcome_distinction": { 160 "applies": true, 161 "answer": true, 162 "justification": "The paper carefully defines what it measures (pass rates, completion time, token usage) and frames these as measures of 'synergy' within the specific benchmark context. Section 4.5 explicitly defines metrics and their derivation. The claims match the granularity of measurements." 163 } 164 }, 165 "setup_transparency": { 166 "model_versions_specified": { 167 "applies": true, 168 "answer": false, 169 "justification": "Models are identified by marketing names only: 'Claude-Sonnet-4', 'GPT-4.1', 'GPT-4o', 'Claude-Sonnet-3.7', 'Gemini-2.5-Pro'. No snapshot dates or API versions are provided. The paper notes evaluations were conducted 'as of July 26, 2025' but does not specify exact model versions." 170 }, 171 "prompts_provided": { 172 "applies": true, 173 "answer": false, 174 "justification": "Appendix C provides example prompts for the task generation agent (GPT-4.1). Appendix F provides summarized task descriptions noting 'The task description below is a short summary of the original text.' The actual full task READMEs sent to Copilot during evaluation are not provided." 175 }, 176 "hyperparameters_reported": { 177 "applies": true, 178 "answer": true, 179 "justification": "Appendix C.1 reports GPT-4.1 hyperparameters: temperature=0.7, top_p=0.9, max_tokens=8192. 
For LLM evaluation: 'Copilot does not permit hyperparameter customization, all models are evaluated with default settings' (Section 5.1)." 180 }, 181 "scaffolding_described": { 182 "applies": false, 183 "answer": false, 184 "justification": "The paper evaluates models through GitHub Copilot as a black box. The authors cannot describe Copilot's internal scaffolding. NA per schema guidelines for evaluating third-party tools." 185 }, 186 "data_preprocessing_documented": { 187 "applies": true, 188 "answer": true, 189 "justification": "The task generation pipeline is documented in detail: template creation → two-stage validation (fidelity check + generalization test) → expert review (Appendix D). The agentic task system's four tools and their invocation sequence are described in Section 4.2." 190 } 191 }, 192 "limitations_and_scope": { 193 "limitations_section_present": { 194 "applies": true, 195 "answer": true, 196 "justification": "Appendix A 'Limitation & Future Work' provides substantive discussion of limitations across multiple dimensions." 197 }, 198 "threats_to_validity_specific": { 199 "applies": true, 200 "answer": true, 201 "justification": "Specific threats discussed: Python-only implementation, Copilot-constrained model selection (excluding DeepSeek, Llama, Qwen, o3, GPT-5), homogeneous participant demographics (East Asian students), and discrete pass/fail conversion impacting interpretability." 202 }, 203 "scope_boundaries_stated": { 204 "applies": true, 205 "answer": true, 206 "justification": "Appendix A explicitly lists what was NOT tested: other programming languages, models via direct APIs, industry developers, diverse ethnic backgrounds, granular scoring beyond pass/fail." 207 } 208 }, 209 "data_integrity": { 210 "raw_data_available": { 211 "applies": true, 212 "answer": false, 213 "justification": "No raw data (participant solutions, interaction logs, LLM outputs) is made available. Only aggregated results are reported." 
214 }, 215 "data_collection_described": { 216 "applies": true, 217 "answer": true, 218 "justification": "Appendix J details the full experimental protocol: informed consent, pre-test questionnaire, task procedure (Steps 1-4 with figures), post-test questionnaire. Data collection mechanisms (shell script submission, backend API, operational logs) are described." 219 }, 220 "recruitment_methods_described": { 221 "applies": true, 222 "answer": true, 223 "justification": "Section 5.1: 'We recruit 45 participants through personal contacts and advertisements posted on university forums.' Appendix H.1 provides detailed selection criteria. Appendix I describes the verification process including GitHub profile and LinkedIn checks." 224 }, 225 "data_pipeline_documented": { 226 "applies": true, 227 "answer": true, 228 "justification": "The full pipeline is documented: template creation → validation (2-stage protocol, Appendix D) → task generation (Section 4.2) → evaluation (hidden test cases, auto-calibrated baselines) → aggregated metrics (Section 4.5). Template pass rates reported (88.9% first-attempt, Appendix D.3)." 229 } 230 }, 231 "conflicts_of_interest": { 232 "funding_disclosed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No funding source, acknowledgments section, or grant information appears anywhere in the paper." 236 }, 237 "affiliations_disclosed": { 238 "applies": true, 239 "answer": true, 240 "justification": "All author affiliations are listed on the first page: NYU Abu Dhabi, NTU, UIUC, Harvard, Zhejiang University, UESTC, Beijing University of Technology, Hong Kong Polytechnic University." 241 }, 242 "funder_independent_of_outcome": { 243 "applies": true, 244 "answer": false, 245 "justification": "No funding is disclosed, so independence cannot be assessed. The paper uses GitHub Copilot extensively but does not disclose any relationship with Microsoft/GitHub." 
246 }, 247 "financial_interests_declared": { 248 "applies": true, 249 "answer": false, 250 "justification": "No competing interests or financial disclosure statement appears in the paper." 251 } 252 }, 253 "contamination": { 254 "training_cutoff_stated": { 255 "applies": true, 256 "answer": false, 257 "justification": "No training data cutoff dates are stated for any of the five models evaluated. The paper only notes evaluations were conducted 'as of July 26, 2025.'" 258 }, 259 "train_test_overlap_discussed": { 260 "applies": true, 261 "answer": false, 262 "justification": "The dynamic task generation inherently mitigates train/test overlap, but the paper never explicitly discusses this as a contamination mitigation. The 450 static task instances for LLMs are not assessed for potential training data overlap." 263 }, 264 "benchmark_contamination_addressed": { 265 "applies": true, 266 "answer": false, 267 "justification": "While the dynamic generation approach and the Equation 2 validation protocol ensure tasks are not trivially solvable, the paper never frames this as addressing benchmark contamination. No decontamination analysis is performed on the static 450-instance dataset." 268 } 269 }, 270 "human_studies": { 271 "pre_registered": { 272 "applies": true, 273 "answer": false, 274 "justification": "No pre-registration is mentioned. No link to OSF, AsPredicted, or any registry is provided." 275 }, 276 "irb_or_ethics_approval": { 277 "applies": true, 278 "answer": true, 279 "justification": "The informed consent form (Appendix J.1) states: 'this study has been reviewed and approved by the Institutional Review Board (IRB).'" 280 }, 281 "demographics_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Appendix H.2 reports demographics: 28 males, 17 females, ages 19-26 (mean 21.4), education levels (24 undergrad, 15 master's, 6 PhD), 84.4% daily LLM users, mean 1.47 internships. Figure 5 provides visual representations." 
285 }, 286 "inclusion_exclusion_criteria": { 287 "applies": true, 288 "answer": true, 289 "justification": "Appendix H.1 lists detailed criteria: age ≥18, CS-related major, ≥2 years programming experience, VS Code proficiency, AI assistant usage ≥3 times/week, Python proficiency, English proficiency, track-relevant expertise." 290 }, 291 "randomization_described": { 292 "applies": true, 293 "answer": true, 294 "justification": "Section 5.1: 'Each participant's task sequence is randomly selected from all balanced permutations of the four conditions. A Latin Square design ensures that every problem appears equally across conditions and is completed by different participants.'" 295 }, 296 "blinding_described": { 297 "applies": false, 298 "answer": false, 299 "justification": "Blinding is not feasible in this design: participants necessarily know whether Copilot is enabled (C2) or not (CH), as the presence of the AI assistant is the experimental manipulation." 300 }, 301 "attrition_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "The paper reports recruiting and studying 45 participants but does not explicitly state whether any participants dropped out or were excluded after beginning the study." 305 } 306 }, 307 "cost_and_practicality": { 308 "inference_cost_reported": { 309 "applies": true, 310 "answer": true, 311 "justification": "Token usage is reported as a primary metric in Tables 2 and 8 (e.g., C2 uses 2.04-2.31M tokens across difficulty levels). Completion times are also reported." 312 }, 313 "compute_budget_stated": { 314 "applies": true, 315 "answer": false, 316 "justification": "No total compute budget is stated: no API costs, no total token expenditure across all experiments, no hardware specifications for the evaluation server or Docker environments."
317 } 318 }, 319 "experimental_rigor": { 320 "seed_sensitivity_reported": { 321 "applies": true, 322 "answer": false, 323 "justification": "Results are reported from single runs (pass@1) and 10 attempts (pass@10) but with no analysis of seed sensitivity or result variation across repeated evaluations." 324 }, 325 "number_of_runs_stated": { 326 "applies": true, 327 "answer": true, 328 "justification": "The paper states 450 task instances (10 per template) for LLM evaluation. pass@1 and pass@10 are reported, implying the number of attempts. C0 allows 'up to 25 attempts' via Copilot's retry mechanism." 329 }, 330 "hyperparameter_search_budget": { 331 "applies": true, 332 "answer": true, 333 "justification": "The paper explicitly states 'Copilot does not permit hyperparameter customization, all models are evaluated with default settings' (Section 5.1). No hyperparameter search was conducted, and this is justified." 334 }, 335 "best_config_selection_justified": { 336 "applies": true, 337 "answer": true, 338 "justification": "All models use default settings with no configuration selection. The baseline model (Claude-Sonnet-4) is justified by 'market prevalence' (citing Menlo Ventures 2025)." 339 }, 340 "multiple_comparison_correction": { 341 "applies": true, 342 "answer": false, 343 "justification": "Five models are compared across multiple conditions, difficulty levels, and tracks (Tables 1, 6, 7). The paper states 'Statistical comparisons use appropriate tests' but provides no details on multiple comparison correction." 344 }, 345 "self_comparison_bias_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "The authors designed the benchmark and evaluate models on it. No discussion of potential bias from evaluating on their own benchmark or whether independent evaluation would yield different results." 
349 }, 350 "compute_budget_vs_performance": { 351 "applies": true, 352 "answer": false, 353 "justification": "Token usage varies substantially across conditions (C0: 0.42-0.58M vs C2: 2.04-2.31M) but the relationship between compute budget and performance is not analyzed." 354 }, 355 "benchmark_construct_validity": { 356 "applies": true, 357 "answer": true, 358 "justification": "Sections 3 and 4 extensively discuss what the benchmark measures (ecological validity, collaboration-necessary tasks) grounded in distributed cognition theory. Participant feedback validates task realism (mean 4.07/5, Table 3). The paper discusses limitations of what it captures vs real-world development." 359 }, 360 "scaffold_confound_addressed": { 361 "applies": false, 362 "answer": false, 363 "justification": "The paper evaluates Copilot as a bundled product — the scaffold IS the thing being tested. All models are evaluated through the same Copilot interface, so scaffold is controlled within the study. NA per schema." 364 } 365 }, 366 "data_leakage": { 367 "temporal_leakage_addressed": { 368 "applies": true, 369 "answer": false, 370 "justification": "The benchmark uses newly generated tasks, which inherently mitigates temporal leakage. However, the paper never explicitly discusses temporal leakage as a concern or explains that dynamic generation serves this purpose." 371 }, 372 "feature_leakage_addressed": { 373 "applies": true, 374 "answer": false, 375 "justification": "The evaluation provides visible test cases during solving and hidden test cases for scoring. The paper does not discuss whether visible test cases could leak information about the hidden test cases or final evaluation criteria." 376 }, 377 "non_independence_addressed": { 378 "applies": true, 379 "answer": false, 380 "justification": "The 450 static instances are generated from 45 templates (10 per template). Instances from the same template share structural similarities. 
The paper does not discuss whether this non-independence affects aggregate results." 381 }, 382 "leakage_detection_method": { 383 "applies": true, 384 "answer": false, 385 "justification": "No explicit leakage detection method is applied. The template validation protocol (Equation 2) ensures tasks are hard but does not specifically test for data leakage." 386 } 387 } 388 }, 389 "red_flags": [ 390 { 391 "flag": "Statistical significance claimed without evidence", 392 "detail": "The paper claims results are 'statistically significant' (Section 5.2) and mentions 'appropriate tests' (Section 5.1) but never reports which tests were used, p-values, or test statistics for any comparison." 393 }, 394 { 395 "flag": "Homogeneous participant sample", 396 "detail": "All 45 participants are East Asian university students/recent graduates aged 19-26. The paper acknowledges this but still draws broad conclusions about 'human-AI synergy' and 'developer competencies in the AI era.' The recruitment via 'personal contacts and university forums' introduces selection bias." 397 }, 398 { 399 "flag": "No artifacts released despite claims", 400 "detail": "The paper uses present tense ('we release a reproducible toolkit') and future tense ('will be openly accessible') for artifact release but provides no URLs, repository links, or DOIs for any code, data, or benchmark materials." 401 }, 402 { 403 "flag": "Circular benchmark validation", 404 "detail": "The benchmark is designed to be hard for LLMs (Equation 2 requires <5% pass rate). Templates that LLMs could solve were rejected. Then the paper's main finding is that LLMs perform poorly on the benchmark. The 'collaboration-necessary' design creates a ceiling that ensures the desired result." 
405 }, 406 { 407 "flag": "Co-reasoning claims based on self-report", 408 "detail": "The central qualitative finding — that LLMs serve as 'co-reasoning partners' — is based primarily on self-reported participant feedback and a single case study (Appendix L). Behavioral log analysis is mentioned but results are not systematically reported." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "SWE-bench: Can language models solve real-world software engineering problems?", 414 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 415 "year": 2024, 416 "relevance": "Key benchmark for evaluating LLM coding capabilities on real-world software engineering tasks, directly compared against HAI-Eval's approach." 417 }, 418 { 419 "title": "Evaluating large language models trained on code", 420 "authors": ["Mark Chen"], 421 "year": 2021, 422 "arxiv_id": "2107.03374", 423 "relevance": "Introduces HumanEval benchmark for code generation, foundational benchmark that HAI-Eval positions against." 424 }, 425 { 426 "title": "The effects of generative ai on high-skilled work: Evidence from three field experiments with software developers", 427 "authors": ["Zheyuan Kevin Cui", "Mert Demirer", "Sonia Jaffe", "Leon Musolff", "Sida Peng", "Tobias Salz"], 428 "year": 2025, 429 "relevance": "Enterprise RCT measuring AI impact on developer productivity, directly relevant to user study methodology comparison." 430 }, 431 { 432 "title": "Measuring the impact of early-2025 ai on experienced open-source developer productivity", 433 "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"], 434 "year": 2025, 435 "arxiv_id": "2507.09089", 436 "relevance": "Controlled study finding negative productivity impacts from AI tools, providing counterpoint to productivity gain claims." 
437 }, 438 { 439 "title": "Experience with github copilot for developer productivity at zoominfo", 440 "authors": ["Gal Bakal", "Ali Dasdan", "Yaniv Katz", "Michael Kaufman", "Guy Levin"], 441 "year": 2025, 442 "arxiv_id": "2501.13282", 443 "relevance": "Enterprise study of Copilot productivity impact, relevant to AI-assisted coding evaluation methodology." 444 }, 445 { 446 "title": "How much does ai impact development speed? an enterprise-based randomized controlled trial", 447 "authors": ["Elise Paradis", "Kate Grey", "Quinn Madison"], 448 "year": 2025, 449 "relevance": "Enterprise RCT on AI impact on development speed, directly relevant comparison for human-AI collaboration study design." 450 }, 451 { 452 "title": "Reading between the lines: Modeling user behavior and costs in ai-assisted programming", 453 "authors": ["Hussein Mozannar", "Gagan Bansal", "Adam Fourney", "Eric Horvitz"], 454 "year": 2024, 455 "relevance": "Framework for modeling human behavior in AI-assisted programming, relevant to understanding developer-AI interaction patterns." 456 }, 457 { 458 "title": "Agent-as-a-judge: Evaluate agents with agents", 459 "authors": ["Mingchen Zhuge"], 460 "year": 2024, 461 "arxiv_id": "2410.10934", 462 "relevance": "Agent-based evaluation methodology, relevant to evolving approaches for evaluating AI coding systems." 463 }, 464 { 465 "title": "Collaborative gym: A framework for enabling and evaluating human-agent collaboration", 466 "authors": ["Yijia Shao", "Vinay Samuel", "Yucheng Jiang", "John Yang", "Diyi Yang"], 467 "year": 2024, 468 "arxiv_id": "2412.15701", 469 "relevance": "Framework for evaluating human-agent collaboration, directly relevant to HAI-Eval's approach of measuring collaboration value." 
470 }, 471 { 472 "title": "MLE-Bench: A comprehensive benchmark for evaluating large language models in machine learning engineering tasks", 473 "authors": ["Dong He"], 474 "year": 2024, 475 "arxiv_id": "2405.16672", 476 "relevance": "Benchmark for ML engineering tasks that HAI-Eval extends beyond by requiring human-AI collaboration." 477 }, 478 { 479 "title": "LiveCodeBench: A comprehensive benchmark for general-purpose language agents", 480 "authors": ["Zihan Zheng"], 481 "year": 2024, 482 "arxiv_id": "2406.01869", 483 "relevance": "Evolving code benchmark focused on algorithmic complexity, positioned as insufficient for measuring collaboration." 484 }, 485 { 486 "title": "Sea change in software development: Economic and productivity analysis of the ai-powered developer lifecycle", 487 "authors": ["Thomas Dohmke", "Marco Iansiti", "Greg Richards"], 488 "year": 2023, 489 "arxiv_id": "2306.15033", 490 "relevance": "Productivity analysis of AI-powered development, relevant to understanding the economic impact HAI-Eval aims to measure." 491 } 492 ] 493 }