scan.json (27376B)
1 { 2 "paper": { 3 "title": "GitTaskBench: A Benchmark for Code Agents Solving Real-World Tasks Through Code Repository Leveraging", 4 "authors": ["Ziyi Ni", "Huacan Wang", "Shuo Zhang", "Shuo Lu", "Ziyang He", "Wang You", "Zhenheng Tang", "Yuntao Du", "Bill Sun", "Hongzhang Liu", "Sen Hu", "Ronghao Chen", "Bo Li", "Xin Li", "Chen Hu", "Binxing Jiao", "Daxin Jiang", "Pin Lyu"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2508.18993" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "GitTaskBench evaluates code agents on 54 real-world tasks across 7 domains requiring repository leveraging. The best system in the main comparison (Table 3), OpenHands+Claude 3.7, solves only 48.15% of tasks; across all failures, environment setup errors account for about 65%. The paper introduces an alpha-value metric integrating task success, cost, and human labor rates to assess economic viability. Open-source models substantially underperform closed ones, and agents perform notably better on textual tasks than on multimodal ones.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "GitHub repository URL provided: https://github.com/QuantaAlpha/GitTaskBench, and a project page at https://gittaskbench.github.io/." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The benchmark tasks, evaluation scripts, and repositories are released via the GitHub repository. 'All tasks and their detailed success criteria are openly available in the official GitHub repository.'" 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": true, 28 "justification": "Execution configurations are detailed in Appendix B: specific Docker containers for OpenHands and SWE-Agent, Python 3.12 Conda for Aider, and hardware specs (9 vCPUs, 33 GB RAM, 2965 GiB SSD). Framework versions pinned in Table 8."
29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper states 'all tasks can be automatically assessed with a single shell command' and provides detailed framework configurations. The benchmark is open-sourced with evaluation scripts." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "No confidence intervals or error bars are reported. Results in Table 3 are point estimates only, despite being averaged over two runs." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "No statistical significance tests are used. Claims like 'OpenHands achieves the best overall performance' and comparisons between frameworks/models are based solely on comparing raw numbers." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Performance differences are reported with baseline context, e.g., 'ECR 72.22%, TPR 48.15%' for the best system, with all other systems' scores shown for comparison. The alpha metric provides economic effect sizes." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No justification for why 54 tasks were chosen. No power analysis or discussion of whether 54 tasks is sufficient for the claims made." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "The paper states 'all reported results are averaged over two independent runs' but does not report standard deviation, variance, or any spread measure across those runs." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Multiple agent frameworks (Aider, OpenHands, SWE-Agent) and multiple LLMs are compared against each other. Table 1 also compares GitTaskBench against prior benchmarks." 
68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Baselines include current state-of-the-art models: GPT-4.1, Claude 3.7, Gemini 2.5 Pro, DeepSeek V3, and Qwen3. Frameworks are pinned to April 2025 releases." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "Sensitivity analysis on timeout and max_iteration hyperparameters (Table 7) shows their impact on performance, functioning as an ablation of configuration choices." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Three main metrics: Execution Completion Rate (ECR), Task Pass Rate (TPR), and the alpha-value economic metric. Token usage and cost are also reported." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "Five raters independently compare each agent output with human-produced groundtruth and assign quality levels for the Q factor. 'Five raters independently compare each agent output with the groundtruth and assign it to one of five levels.'" 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": false, 92 "justification": "All 54 tasks are used for evaluation. There is no held-out test set or dev/test split. The benchmark is a single evaluation set." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Figure 5 provides domain-specific performance breakdowns across Image Processing, Video, Speech, Physiological Signals, Security, Web Scraping, and Office Document Processing." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Extensive error analysis in the Error Analysis section and Appendix F, with five error types classified and detailed case studies of failures (Listings 4-13)." 
103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper reports that agents struggle with multimodal tasks, that replacing humans is not always cost-effective (negative alpha scores), and that environment setup causes 65% of failures. Gemini 2.5 Pro 'underwhelms in think mode.'" 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims about 48.15% best TPR (Table 3), environment setup failures (Figure 8b, 65%), and the alpha metric are all supported by results. The updated 62.96% figure for RepoMaster+Claude 3.5 is mentioned in the abstract and leaderboard." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": false, 119 "justification": "The paper makes causal claims like 'likely due to its robust code execution capabilities and more proactive and explorative strategies' for OpenHands' superiority, without controlled experiments isolating these factors." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper claims to evaluate 'real-world tasks' and agent capabilities generally, but tests only 54 tasks across 18 repositories — all Python-based, with specific selection criteria (≥50 stars). The title and framing suggest broader applicability than the tested scope supports." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No substantive discussion of alternative explanations for performance differences. For example, framework differences in resource allocation, prompt design, or runtime behavior could explain results, but these are not systematically analyzed." 
130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper distinguishes between ECR (execution completion) and TPR (task pass), and introduces the alpha metric to bridge the gap between technical performance and practical economic value. The proxy/outcome relationships are explicit." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Specific model versions are provided: 'GPT-4o-2024-08-06', 'Claude-3-5-sonnet-20241022', 'Claude-3-7-sonnet-20250219'. Table 8 lists exact framework versions (OpenHands 0.33.0, SWE-Agent v1.0.1-61-gaa4e8ea1, Aider v0.84.1.dev-21-gb2592267)." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Full prompt templates are provided in Appendix C (Listings 1-2), including the core objective, workflow guidelines, and key constraints used for agent task execution." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Temperature (0.5), top-p (1.0), max response length (4096 tokens), timeout (600s for OpenHands), and other framework-specific parameters are listed in Table 8 and Appendix B." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "The paper evaluates third-party agent frameworks (OpenHands, SWE-Agent, Aider) off the shelf rather than proposing its own scaffold. These are publicly released frameworks with their own documentation (versions pinned in Table 8), so the authors are not expected to re-describe their internal scaffolding." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "The task and repository selection process is documented with criteria (Python-based, ≥50 stars, active in past 5 years), completeness verification procedures, and the four-step construction pipeline (Figure 4)."
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Appendix G contains a 'Limitations & Future Work' section discussing scope limitations and planned expansions." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "The limitations in Appendix G are generic future work plans ('will further expand the benchmark', 'will broaden model coverage') rather than specific threats to validity of current results." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges the benchmark 'focuses on practical, user-facing tasks' but does not bound the generalizability of its findings to the 54 Python-only tasks tested." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "The benchmark, evaluation scripts, and task definitions are open-sourced. Agent execution traces (events folders, batch_results.jsonl) appear to be available through the repository." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "The four-step curation pipeline is described in detail: task/repository selection with criteria, completeness verification by experts, execution framework design, and evaluation framework development (Figure 4)." 191 }, 192 "recruitment_methods_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Five PhD students in computer science performed completeness verification and human evaluation. Human completion times are reported (average 1.34 hours). However, evaluator selection criteria beyond 'PhD student' are not described." 
196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline from repository selection to evaluation is documented with specific criteria at each stage (Figure 4), including selection criteria, verification steps, and evaluation metrics." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding information or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are listed: UCAS, CASIA, BUPT, NUS, StepFun, HKUST, SDU, PINAI, USYD, PKU, USTC. StepFun is a company, and several authors are affiliated with it." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "StepFun (an AI company) has authors on the paper and corresponding authors affiliated with it. StepFun has a potential interest in code agent benchmarking outcomes. No independence statement is provided." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No training data cutoff dates are stated for any of the evaluated models, despite using pre-trained LLMs on benchmark tasks." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether the 18 GitHub repositories used as tasks could appear in any model's training data. Popular repositories with ≥50 stars are likely in training data." 
235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "The benchmark tasks involve well-known open-source repositories (SpeechBrain, Scrapy, etc.) that are almost certainly in LLM training data. This contamination risk is not discussed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in an experimental study. The five PhD raters are evaluators of agent outputs, not study participants." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants study; PhD students served as benchmark validators and output evaluators." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "API costs are reported per framework-model combination in Table 3 (e.g., $29.8 for OpenHands+Claude 3.7). Token pricing is detailed in Table 11." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "Hardware specs provided (9 vCPUs, 33 GB RAM, 2965 GiB SSD). API costs, token counts, and per-task costs are reported. The alpha metric explicitly incorporates operational costs." 
289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "Results are averaged over two runs but no seed sensitivity analysis or variance across seeds is reported." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "'All reported results are averaged over two independent runs under identical settings.'" 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "The sensitivity analysis in Table 7 tests different timeout and max_iteration values, but no systematic hyperparameter search budget is reported. It's unclear how the default configurations were selected." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": true, 310 "justification": "The paper uses default framework settings except for explicitly stated changes (e.g., timeout=600s for OpenHands). Configuration choices are documented and justified in Appendix B." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite many pairwise comparisons across models and frameworks." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors propose the benchmark and evaluate systems on it without acknowledging potential bias in benchmark design favoring certain capabilities. RepoMaster (from the same research group based on author overlap) achieves the top score." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": true, 325 "justification": "Table 3 reports both performance (ECR/TPR) and cost ($) for every configuration. The alpha metric explicitly relates performance to cost. The sensitivity analysis shows timeout vs. performance tradeoffs." 
326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of whether the 54 tasks actually measure 'real-world' code agent capability. The tasks are curated by the authors with no external validation of representativeness." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": true, 335 "justification": "The paper evaluates multiple frameworks (OpenHands, SWE-Agent, Aider) with the same LLMs, allowing readers to see scaffold effects. Table 3 shows the same model performing differently across frameworks." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "The repositories used (e.g., SpeechBrain, Scrapy, DeOldify) are well-established open-source projects. Models trained on GitHub data have likely seen these repositories' code, READMEs, and usage patterns. This is not discussed." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether providing repository READMEs and code in the prompt constitutes feature leakage — models may have memorized repository usage patterns from training data." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Some tasks share the same repository (e.g., NeuroKit has 3 tasks, SpeechBrain has 5). No discussion of whether these non-independent tasks inflate performance estimates." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention method is used despite evaluating models on well-known public repositories." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "OpenHands+Claude 3.7 achieves the best performance with 72.22% ECR and 48.15% TPR", 364 "evidence": "Table 3 shows comparative results across all framework-model combinations. 
OpenHands+Claude 3.7 has the highest ECR and TPR.", 365 "supported": "strong" 366 }, 367 { 368 "claim": "Environment setup errors account for 65% of all failures", 369 "evidence": "Figure 8(b) shows error distribution: E1 (Environment-Setup) at 65.04%, followed by E2 (Workflow Planning) at 11%, E4 (Runtime) at 10%, E3 (Repository Comprehension) at 9%, E5 (Instructions) at 5%.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Agents perform notably better on purely textual tasks compared to multimodal, model-based tasks", 374 "evidence": "Figure 5 shows domain-specific performance. Office document processing tasks have higher success rates than image/speech processing tasks. The paper explains this is because textual tasks typically require 'reading simple wrapper scripts that import the library API.'", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Replacing humans with agents is not always cost-effective", 379 "evidence": "Alpha scores in Table 10 show negative values for several repositories (e.g., AnimeGANv3 at -11.134 for Claude 3.5, DeScratch at -3.967). Low-MV image tasks often produce negative alpha when agent cost exceeds $1-2.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "DeepSeek V3 delivers the highest overall benefit and best cost-performance ratio", 384 "evidence": "Table 10 and Figure 7 show DeepSeek V3's alpha scores. 
However, this is for the alpha metric only — DeepSeek V3 has lower TPR (26.85%) than GPT-4.1 (42.59%) or Claude 3.7 (48.15%) under OpenHands.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "More generous timeout and max_iteration settings significantly boost performance", 389 "evidence": "Table 7 shows ECR increasing from 18.52% to 50.00% as timeout increases from 120s to 1800s, and TPR from 33.33% to 37.04% as max_iteration increases from 30 to 100.", 390 "supported": "strong" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "Self-comparison bias", 396 "detail": "RepoMaster, which achieves the top score of 62.96% on the leaderboard (Figure 1), shares authors with this paper (Wang et al. 2025a lists Huacan Wang as first author). The benchmark may be designed in ways that favor the authors' own system." 397 }, 398 { 399 "flag": "Contamination risk unaddressed", 400 "detail": "The 18 repositories are well-known open-source projects (SpeechBrain 8k+ stars, Scrapy 53k+ stars) whose code, READMEs, and usage patterns are almost certainly in LLM training data. Models may be recalling solutions rather than reasoning about repositories." 401 }, 402 { 403 "flag": "Very small benchmark size", 404 "detail": "54 tasks across 18 repositories is a small benchmark. With only 1-5 tasks per repository, individual task success/failure has outsized impact on domain-level conclusions. No justification is given for this sample size." 405 }, 406 { 407 "flag": "Only two runs per configuration", 408 "detail": "Results are averaged over only two independent runs with no variance reported. Two runs is insufficient to estimate result stability, especially with stochastic LLM outputs at temperature 0.5." 409 }, 410 { 411 "flag": "No statistical tests despite comparative claims", 412 "detail": "The paper makes numerous comparative claims (e.g., framework X outperforms Y, model A is more cost-efficient than B) without any statistical testing. 
With 54 tasks and two runs, observed differences could easily be within noise." 413 }, 414 { 415 "flag": "Market value estimates are subjective", 416 "detail": "The alpha metric relies on 'estimated, prevailing market value' from freelance platforms (Fiverr, Upwork). These estimates (e.g., $5 for scratch detection, $150 for video action analysis) are highly variable and could be selected to favor certain conclusions." 417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "Evaluating Large Language Models trained on Code", 422 "authors": ["Mark Chen"], 423 "year": 2021, 424 "arxiv_id": "2107.03374", 425 "relevance": "Introduced HumanEval, a foundational LLM code generation benchmark." 426 }, 427 { 428 "title": "Swe-bench: Can Language Models Resolve Real-World GitHub Issues?", 429 "authors": ["Carlos E. Jimenez"], 430 "year": 2023, 431 "arxiv_id": "2310.06770", 432 "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks (GitHub issue resolution)." 433 }, 434 { 435 "title": "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?", 436 "authors": ["Samuel Miserendino"], 437 "year": 2025, 438 "arxiv_id": "2502.12115", 439 "relevance": "Benchmark linking software engineering agent performance to real-world economic value." 440 }, 441 { 442 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 443 "authors": ["Xingyao Wang"], 444 "year": 2025, 445 "arxiv_id": "2407.16741", 446 "relevance": "Major open-source agent framework evaluated in this benchmark." 447 }, 448 { 449 "title": "Swe-agent: Agent-computer interfaces enable automated software engineering", 450 "authors": ["John Yang"], 451 "year": 2024, 452 "relevance": "Key agent framework for automated software engineering, evaluated in the benchmark." 
453 }, 454 { 455 "title": "MLE-Bench: Evaluating Machine Learning Agents on Machine Learning Engineering", 456 "authors": ["Jun Shern Chan"], 457 "year": 2025, 458 "arxiv_id": "2410.07095", 459 "relevance": "Benchmark for ML engineering agents with Kaggle competition tasks." 460 }, 461 { 462 "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research", 463 "authors": ["Giulio Starace"], 464 "year": 2025, 465 "relevance": "Benchmark for evaluating AI agents on research paper replication tasks." 466 }, 467 { 468 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 469 "authors": ["Terry Yue Zhuo"], 470 "year": 2024, 471 "arxiv_id": "2406.15877", 472 "relevance": "Code generation benchmark with diverse function calls and complex task instructions." 473 }, 474 { 475 "title": "MLAgentBench: Evaluating Language Agents on Machine Learning Experimentation", 476 "authors": ["Qian Huang"], 477 "year": 2023, 478 "arxiv_id": "2310.03302", 479 "relevance": "Benchmark for evaluating LLM agents on ML experimentation tasks." 480 }, 481 { 482 "title": "ChatDev: Communicative Agents for Software Development", 483 "authors": ["Chen Qian"], 484 "year": 2023, 485 "relevance": "Multi-agent framework for software development using LLM agents." 486 }, 487 { 488 "title": "RepoMaster: Autonomous Exploration and Understanding of GitHub Repositories for Complex Task Solving", 489 "authors": ["Huacan Wang"], 490 "year": 2025, 491 "relevance": "Repository-aware agent framework achieving top GitTaskBench scores; shares authors with this paper." 492 }, 493 { 494 "title": "Livecodebench: Holistic and contamination free evaluation of large language models for code", 495 "authors": ["Naman Jain"], 496 "year": 2024, 497 "arxiv_id": "2403.07974", 498 "relevance": "Contamination-aware code evaluation benchmark." 499 } 500 ] 501 }