scan.json (19365B)
1 { 2 "paper": { 3 "title": "Copilot Evaluation Harness: Evaluating LLM-Guided Software Programming", 4 "authors": ["Anisha Agarwal", "Aaron Chan", "Shubham Chandel", "Jinu Jang", "Shaun Miller", "Roshanak Zilouchian Moghaddam", "Yevhen Mohylevskyy", "Neel Sundaresan", "Michele Tufano"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2402.14261" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "Section 6 states 'Future work on this project involves... open-sourcing our data and evaluation code.' No code is released; it is promised for the future." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "Section 6 lists open-sourcing the data as future work — data is promised for future release but not currently available. No download links or repository URLs are provided." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "Section 4.1 mentions Node 18+, Python 3.8+, Java JDK 1.8, .NET 6.0/7.0/8.0, and clang for C++, but these are build requirements for the evaluated repos, not a reproducible environment specification for the harness itself." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a high level but there are no runnable scripts or README with commands." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Tables 1 and 2 report only point estimates (e.g., '83%', '74%') with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims GPT-4 'generally outperforms' GPT-3.5 and Code Llama based solely on comparing percentage numbers without any statistical significance tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "Raw percentages are reported but no effect sizes (Cohen's d, odds ratios, etc.) are provided. The prompt improvement is described as '5% in C++ to 11% in Java' but without baseline context or a formal effect size measure." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper mentions 'hundreds of test cases' and data from 'hundreds of public GitHub repositories' but never states exact sample sizes per language/scenario or justifies why those sizes are sufficient." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be single-run evaluations with no indication of variance across runs." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Three models (GPT-3.5, GPT-4, Code Llama) are compared against each other across tasks, serving as mutual baselines. The paper also positions itself against HumanEval and BLEU-based evaluations." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "GPT-3.5, GPT-4, and Code Llama were all state-of-the-art or near-SOTA models at the time of writing (2024)." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "No ablation study is presented. The harness has multiple components (prompt design, context provision, parsing) but no systematic ablation of their individual contributions beyond the single prompt change for doc generation." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Each scenario uses multiple metrics: e.g., doc uses syntax correctness and format correctness; fix uses syntax correctness and fix rate; workspace uses MRR and end-to-end keyword detection." 
79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of system outputs is performed. All evaluation is automated (syntax checking, test execution, static analysis, keyword detection)." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": false, 88 "justification": "No mention of held-out test sets or train/test splits. The evaluation uses all collected test cases without separation." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down per language (Tables 1, 2) and per bug type (Table 3 for TypeScript fix errors)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Figures 7, 8, 10, 11 show concrete failure examples with analysis. Section 5.2 discusses error categories for doc generation (code logic changes, syntax changes, incomplete docstrings, irrelevant docstrings)." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports cases where GPT-4 fails while GPT-3.5 succeeds (Figures 8, 9, 10), and discusses GPT-4's more nuanced but failing approaches. C# bug fixing shows GPT-3.5 outperforming GPT-4." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims to introduce an evaluation harness with metrics for five scenarios and to provide learnings from evaluating three LLMs. The paper delivers on this, though only two of five scenarios have detailed results." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "Section 5.2.1 claims that adding a prompt instruction 'resulted in a significant improvement' (5-11%), which is a causal claim. However, there is no controlled experiment — confounds like different test cases or model stochasticity are not addressed." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title says 'Evaluating LLM-Guided Software Programming' broadly, but results are only presented for 2 of 5 scenarios (doc and fix). The paper does not sufficiently bound its claims to the tested scenarios." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper offers one speculative explanation for GPT vs Code Llama differences (training corpus size, Section 5.1.1) but does not systematically discuss confounds like prompt sensitivity, temperature settings, or data selection bias." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper refers to 'GPT-3.5', 'GPT-4', and 'Code Llama' without specifying exact model versions, snapshot dates, or API versions." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Figures 6, 7, 8, 11 show the actual prompts sent to the models for fix scenarios. Figures 9 and 10 show doc prompts. These are concrete prompt texts, not just descriptions." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No mention of temperature, top-p, max tokens, or other sampling parameters used for the LLM API calls." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "The paper evaluates VS Code's chat extension as a black box IDE integration. The authors cannot be expected to describe the internal scaffolding of the third-party tool." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4.1 describes repository filtering criteria (size 1-100MB, build time <10 min, must contain methods) and per-language build requirements. 
Section 4.2 describes test case creation criteria for each scenario." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion mentions future work but does not discuss limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed anywhere in the paper." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges presenting only 2 of 5 scenarios but does not discuss the limitations of generalizing from those results." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw data is available. Data release is promised as future work." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 4.1 describes data collection: public GitHub repos, filtered by language-specific build requirements, size constraints, and ability to build/test. Section 4.2 describes test case generation per scenario." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants in the main evaluation. The telemetry data from Microsoft developers (Section 5.3) is usage data, not a recruited study." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline from repository selection to test case creation to evaluation is described across Sections 4.1 and 4.2, with filtering criteria at each stage. However, exact counts of repos/methods at each stage are not provided." 
192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding statement or acknowledgments section is present in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All authors are listed as affiliated with Microsoft, Redmond, USA." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "All authors are Microsoft employees evaluating VS Code's Copilot integration. Microsoft has a direct financial interest in favorable results for its Copilot product." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present. The conflict between Microsoft authorship and Copilot evaluation is not explicitly acknowledged." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "No training data cutoff dates are stated for any of the three models evaluated (GPT-3.5, GPT-4, Code Llama)." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "The test cases come from public GitHub repositories. Given that GPT models are trained on public code, potential train/test overlap is a significant concern that is not discussed." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "The evaluation uses public GitHub code that likely existed before the models' training cutoffs. This contamination risk is never addressed." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in the study." 
238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in the study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in the study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference costs, API costs, or latency figures are reported despite evaluating three LLMs across hundreds of test cases." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No compute budget, GPU hours, or total API spend is mentioned." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "GPT-4 generally outperforms GPT-3.5 and Code Llama on documentation generation and bug fixing tasks.", 286 "evidence": "Tables 1 and 2 show GPT-4 achieving higher syntax and format correctness for doc and higher fix rates across most languages (Section 5.1).", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "The evaluation harness metrics are more robust and information-dense than previous evaluation systems like HumanEval.", 291 "evidence": "Sections 1 and 3 argue that HumanEval uses simple algorithmic problems while the harness uses real-world codebases with multiple metrics, but no empirical comparison of metric quality is provided.", 292 "supported": "weak" 293 }, 294 { 295 "claim": "Adding a prompt instruction not to change focal code resulted in 5-11% improvement in doc generation across languages.", 296 "evidence": "Section 5.2.1 states this improvement but provides no table, no per-language breakdown of the improvement, and no statistical testing.", 297 "supported": "weak" 298 }, 299 { 300 "claim": "The evaluation dataset is representative of real-world usage based on embedding similarity with Microsoft developer telemetry.", 301 "evidence": "Section 5.3 uses PCA visualization (Figures 12, 13) showing test cases and real usage overlap in embedding space, but no quantitative similarity metric is computed.", 302 "supported": "weak" 303 } 304 ], 305 "methodology_tags": ["benchmark-eval"], 306 "key_findings": "The paper introduces the Copilot Evaluation Harness with five evaluation scenarios for LLM-guided IDE programming, presenting detailed results for documentation generation and bug fixing across six languages. GPT-4 generally outperforms GPT-3.5 and Code Llama, though GPT-3.5 sometimes passes with simpler solutions where GPT-4 fails with more nuanced but incorrect approaches. 
The harness revealed actionable integration improvements, such as prompt modifications that improved doc generation by 5-11%.", 307 "red_flags": [ 308 { 309 "flag": "Company evaluating its own product", 310 "detail": "All nine authors are Microsoft employees evaluating Microsoft's VS Code Copilot integration. This conflict of interest is not disclosed or acknowledged in the paper." 311 }, 312 { 313 "flag": "No statistical rigor", 314 "detail": "All comparative claims are based on raw percentage comparisons with no confidence intervals, significance tests, variance reporting, or sample sizes. It is impossible to assess whether observed differences are meaningful." 315 }, 316 { 317 "flag": "Incomplete results", 318 "detail": "The paper defines five evaluation scenarios but only reports results for two (doc and fix). The abstract and title suggest a more comprehensive evaluation than is delivered." 319 }, 320 { 321 "flag": "Contamination risk unaddressed", 322 "detail": "Test cases come from public GitHub repositories that the evaluated models were likely trained on. This fundamental validity concern is never discussed." 323 }, 324 { 325 "flag": "No limitations section", 326 "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries." 327 } 328 ], 329 "cited_papers": [ 330 { 331 "title": "Evaluating large language models trained on code", 332 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 333 "year": 2021, 334 "relevance": "Introduces HumanEval benchmark, the primary baseline this paper positions against for code generation evaluation." 335 }, 336 { 337 "title": "Code llama: Open foundation models for code", 338 "authors": ["Baptiste Roziere"], 339 "year": 2023, 340 "arxiv_id": "2308.12950", 341 "relevance": "One of the three LLMs evaluated in the harness; represents open-source code-specific models." 
342 }, 343 { 344 "title": "GPT-4 technical report", 345 "authors": ["OpenAI"], 346 "year": 2023, 347 "relevance": "One of the three LLMs evaluated; the strongest proprietary model tested." 348 }, 349 { 350 "title": "Large language models for software engineering: A systematic literature review", 351 "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"], 352 "year": 2023, 353 "arxiv_id": "2308.10620", 354 "relevance": "Comprehensive SLR on LLMs for SE covering evaluation methods, relevant as a survey of the field this paper contributes to." 355 }, 356 { 357 "title": "CodeXGLUE: A machine learning benchmark dataset for code understanding and generation", 358 "authors": ["Shuai Lu", "Daya Guo", "Shuo Ren"], 359 "year": 2021, 360 "relevance": "Comprehensive evaluation platform for LLMs in SE tasks, a key related benchmark." 361 }, 362 { 363 "title": "A survey on evaluation of large language models", 364 "authors": ["Yupeng Chang", "Xu Wang"], 365 "year": 2023, 366 "relevance": "Surveys LLM evaluation methods broadly, providing context for evaluation methodology choices." 367 }, 368 { 369 "title": "Lost in the middle: How language models use long contexts", 370 "authors": ["Nelson F. Liu", "Kevin Lin", "John Hewitt"], 371 "year": 2023, 372 "relevance": "Relevant to context ordering in IDE prompts, cited as motivation for prompt design considerations." 373 }, 374 { 375 "title": "Chain-of-thought prompting elicits reasoning in large language models", 376 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 377 "year": 2023, 378 "relevance": "Foundational prompting technique cited as relevant to IDE integration prompt engineering." 379 } 380 ] 381 }