scan.json (31392B)
1 { 2 "paper": { 3 "title": "JavaBench: A Benchmark of Object-Oriented Code Generation for Evaluating Large Language Models", 4 "authors": [ 5 "Jialun Cao", 6 "Zhiyong Chen", 7 "Jiarong Wu", 8 "Shing-Chi Cheung", 9 "Chang Xu" 10 ], 11 "year": 2024, 12 "venue": "International Conference on Automated Software Engineering", 13 "arxiv_id": "2406.12902", 14 "doi": "10.1145/3691620.3695470" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "The benchmark is publicly released at https://github.com/java-bench/JavaBench as stated in the abstract and Section 8: 'We released the implementation and all associated publicly available data.'" 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The benchmark data (projects, test suites, skeletons) is released via the same GitHub repository. Section 8 confirms 'all associated publicly available data' are released." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "The paper mentions hardware ('two NVIDIA RTX 6000 Ada GPUs, each with 48GB') and specific model IDs, but provides no requirements.txt, Dockerfile, or detailed software dependency versions needed to recreate the evaluation environment." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper provides a GitHub link and describes the methodology, but does not include step-by-step reproduction instructions, scripts to replicate experiments, or a 'Reproducing Results' section." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "All results in Tables 5 and 6 are reported as point estimates (e.g., '72.33% Compilation@1') with no confidence intervals, error bars, or ± notation." 44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper makes numerous comparative claims ('holistic synthesis was generally better,' 'selected context yield the best overall results') based solely on comparing raw numbers without any significance tests." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": true, 53 "justification": "The paper reports effect sizes with baseline context throughout, e.g., 'WizardCoder dropped 51.43% (80.00% - 28.57%)' and '23.62% (= 34.34%-10.72%) drop.' Raw values and differences are consistently provided." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "The sample size of n=5 generated solutions is acknowledged only as 'Considering the cost and time, we set n to 5 and k to 1 and 5, following the previous study [28].' No power analysis or adequacy justification is provided." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "Despite generating 5 samples per configuration, no standard deviations or variance measures are reported across runs. Pass@k is computed as an expectation using the unbiased estimator, but no spread measures are provided." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "Five LLMs are compared against each other (Table 5) and against a human baseline of 282 undergraduate students (90.93% average, Table 2). The paper also compares synthesis strategies and context settings." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": false, 75 "justification": "The studied LLMs (WizardCoder-15B, DeepSeek-6.7b/33b, Phind-34B, gpt-3.5-turbo) were released in 2022-2023. For a 2024 ASE paper, notable omissions include GPT-4, Claude, and other frontier models. The 34B parameter cap due to hardware limitations is acknowledged but not mitigated." 76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "RQ2 ablates context settings (maximum/minimum/selected) and RQ1/RQ3 compare synthesis strategies (holistic/independent/incremental with sequential/reverse/random orders), measuring the impact of each factor." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "Three progressive metrics are used: Completion@k, Compilation@k, and Pass@k, each at two granularities (class-wise and test-wise), as defined in Section 3.2.2." 86 }, 87 "human_evaluation": { 88 "applies": true, 89 "answer": false, 90 "justification": "The 282 students validated the benchmark by completing the projects (Section 2.2.3), not by evaluating LLM outputs. The bad case analysis in RQ4 involves manual inspection of failures but is not systematic human evaluation of generated code." 91 }, 92 "held_out_test_set": { 93 "applies": true, 94 "answer": true, 95 "justification": "The benchmark is used as a fixed test set. Models are evaluated without any fine-tuning or selection decisions on the benchmark data. The benchmark was kept confidential and not used in model training." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Results are broken down by project (P1-P4), by model, by synthesis strategy, by context setting, and by evaluation granularity (class-wise vs. test-wise) in Tables 5 and 6." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "RQ4 (Section 4.4) provides detailed bad case analysis with five specific failure examples (Listings 3-7): inheritance errors, encapsulation violations, illegal inheritance, documentation non-following, and trivial implementations." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper reports that no project can be correctly completed at project-wise granularity (all-zero Pass@k under all settings). It also reports reversed order in incremental synthesis does not help (Finding 7)." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Abstract claims are supported: 'no project can be correctly completed' (confirmed by all-zero project-wise Pass@k), LLMs far behind students (48.24% best avg test-wise Pass@5 vs 90.93% human), method signature as optimal context (Table 6 confirms selected context best). The '41.17% Pass@5' in the abstract aligns approximately with Section 4.1's '41.7%' for a specific setting." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "Causal claims like 'holistic synthesis yielded better performance' and 'providing too much or too little context has a negative impact' are supported by controlled single-variable manipulation: varying synthesis strategy while fixing context (RQ1), and varying context while fixing strategy (RQ2)." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": false, 127 "justification": "The abstract states 'LLMs are far behind undergraduate students' broadly, and findings like 'LLMs' capability to handle OOP features is far behind that of undergraduates' generalize beyond the 5 tested models and 4 student game projects. The threats section notes model generalization but the claims remain broad." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section 5 discusses prompt engineering as an alternative explanation ('a well-designed prompt could yield better performance'). Section 4.1 discusses that differences from ClassEval could be due to 'different programming languages (Python VS. Java) and code granularities (Class-level VS. Project-level),' offering specific alternative explanations for observed differences." 133 }, 134 "proxy_outcome_distinction": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper measures Pass@k on 4 entry-level course game projects and frames this as measuring 'LLMs' capability to handle OOP features' and 'project-level Java programming ability.' The gap between pass rates on 4 specific student assignments and general Java OOP capability is not acknowledged." 138 } 139 }, 140 "setup_transparency": { 141 "model_versions_specified": { 142 "applies": true, 143 "answer": true, 144 "justification": "Table 4 specifies exact model identifiers: 'WizardCoder-15B-V1.0', 'deepseek-coder-6.7b-instruct', 'deepseek-coder-33b-instruct', 'Phind-CodeLlama-34B-v2', 'gpt-3.5-turbo-1106' with sizes and release dates." 145 }, 146 "prompts_provided": { 147 "applies": true, 148 "answer": true, 149 "justification": "Listing 2 provides the full prompt template with system message and instruction text. The placeholder fill values (context and class code) come from the released benchmark and are thoroughly explained with examples in Section 3.1.1 and Figure 2." 150 }, 151 "hyperparameters_reported": { 152 "applies": true, 153 "answer": true, 154 "justification": "Section 4 states 'nucleus sampling... with a temperature of 0.2' and 'five solution samples were randomly generated.' Context truncation to 8192 characters is also specified (Section 3.1.1)." 155 }, 156 "scaffolding_described": { 157 "applies": false, 158 "answer": false, 159 "justification": "No agentic scaffolding is used. The evaluation involves direct prompting of LLMs to complete TODO methods, with no tools, feedback loops, or agent workflows." 160 }, 161 "data_preprocessing_documented": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 2 documents benchmark construction (project selection, canonical solutions, test construction). Section 3.1.1 describes context extraction using jdeps for dependency analysis. The truncation to 8192 characters is specified with statistics (53.3% contexts truncated)." 165 } 166 }, 167 "limitations_and_scope": { 168 "limitations_section_present": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 5 'Threats to Validity' provides a dedicated discussion of limitations including benchmark construction, model generalizability, prompt engineering, and data contamination." 172 }, 173 "threats_to_validity_specific": { 174 "applies": true, 175 "answer": true, 176 "justification": "The threats are specific to this study: 'we only studied five LLMs due to time and hardware limits,' 'the projects in JavaBench were kept confidential,' and quality of natural language descriptions could affect results. These go beyond generic disclaimers." 177 }, 178 "scope_boundaries_stated": { 179 "applies": true, 180 "answer": false, 181 "justification": "The paper notes 'the conclusion may not be able to generalize to other LLMs' but does not explicitly state what the results do NOT show, such as limitations to entry-level game projects, or inapplicability to industrial Java development. The boundaries are stated generically." 182 } 183 }, 184 "data_integrity": { 185 "raw_data_available": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 8 states 'We released the implementation and all associated publicly available data at https://github.com/java-bench/JavaBench.' The benchmark projects, test suites, and canonical solutions are available for verification." 189 }, 190 "data_collection_described": { 191 "applies": true, 192 "answer": true, 193 "justification": "Section 2 describes benchmark construction in detail: projects from an entry-level Java course, canonical solutions by experienced programmers with 5+ years of experience and cross-validation, test suites manually constructed with coverage metrics reported." 194 }, 195 "recruitment_methods_described": { 196 "applies": true, 197 "answer": false, 198 "justification": "The 282 students are described as 'undergraduate students' in an 'entry-level Java course' across '2019 to 2022,' but no details about how the course was selected, whether these students are representative, or potential selection biases are provided." 199 }, 200 "data_pipeline_documented": { 201 "applies": true, 202 "answer": true, 203 "justification": "The full pipeline is documented: project design (Section 2.2.1) → canonical solution construction (cross-validated) → test suite construction with coverage metrics (Section 2.2.2) → student validation (Section 2.2.3) → LLM evaluation pipeline (Section 3). Student exclusion criteria are documented in footnote 1." 204 } 205 }, 206 "conflicts_of_interest": { 207 "funding_disclosed": { 208 "applies": true, 209 "answer": false, 210 "justification": "No funding or acknowledgments section is present in the provided paper text. The authors are from HKUST and Nanjing University but no funding sources are disclosed." 211 }, 212 "affiliations_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Author affiliations are clearly listed: HKUST Department of Computer Science and Engineering, Guangzhou HKUST Fok Ying Tung Research Institute, and State Key Laboratory for Novel Software Technology at Nanjing University." 216 }, 217 "funder_independent_of_outcome": { 218 "applies": true, 219 "answer": false, 220 "justification": "No funding is disclosed, so independence cannot be assessed. The authors are from academic institutions and evaluate third-party models, suggesting low conflict risk, but the absence of a funding disclosure statement itself is the issue." 221 }, 222 "financial_interests_declared": { 223 "applies": true, 224 "answer": false, 225 "justification": "No competing interests or financial interests statement is present in the paper." 226 } 227 }, 228 "contamination": { 229 "training_cutoff_stated": { 230 "applies": true, 231 "answer": false, 232 "justification": "Table 4 lists model release dates (e.g., June 2023, Nov 2022) but does not state training data cutoff dates. Release date is not equivalent to training cutoff." 233 }, 234 "train_test_overlap_discussed": { 235 "applies": true, 236 "answer": true, 237 "justification": "Section 5 explicitly discusses contamination: 'LLMs having seen the canonical code during training could lead to exaggerated scores, known as data contamination. However, the projects in JavaBench were kept confidential.' Section 2.2.1 notes students must keep assignments confidential." 238 }, 239 "benchmark_contamination_addressed": { 240 "applies": true, 241 "answer": true, 242 "justification": "The paper addresses contamination by arguing the benchmark projects were kept confidential (Section 2.2.1: 'Students are required to keep the course assignments and canonical solutions confidential for academic integrity, which reduces the data contamination threat')." 243 } 244 }, 245 "human_studies": { 246 "pre_registered": { 247 "applies": true, 248 "answer": false, 249 "justification": "No pre-registration is mentioned. The use of student data from a Java course was not pre-registered as a research study." 250 }, 251 "irb_or_ethics_approval": { 252 "applies": true, 253 "answer": false, 254 "justification": "No IRB or ethics board approval is mentioned despite using data from 282 undergraduate students." 255 }, 256 "demographics_reported": { 257 "applies": true, 258 "answer": false, 259 "justification": "Students are described only as 'undergraduate students' in an 'entry-level Java course' from 2019-2022. No information on gender, age, geographic distribution, or prior programming experience is reported." 260 }, 261 "inclusion_exclusion_criteria": { 262 "applies": true, 263 "answer": true, 264 "justification": "Footnote 1 states: 'we omit course withdrawals, non-submissions, and blank project submissions from the count because these cases do not attempt to complete the project.'" 265 }, 266 "randomization_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "Not applicable — this is not an experimental study with conditions. Students completed course assignments as part of their normal coursework; no randomization to conditions occurred." 270 }, 271 "blinding_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "Not applicable — students completed standard course assignments. There were no experimental conditions requiring blinding." 275 }, 276 "attrition_reported": { 277 "applies": true, 278 "answer": true, 279 "justification": "Footnote 1 explains that withdrawals, non-submissions, and blank submissions were excluded. Per-project participant counts are reported (62-79 students per project, 282 total)." 280 } 281 }, 282 "cost_and_practicality": { 283 "inference_cost_reported": { 284 "applies": true, 285 "answer": false, 286 "justification": "No API costs, tokens consumed, or per-example costs are reported. The paper mentions hardware but not the cost of running experiments across 5 models × multiple configurations × 5 samples." 287 }, 288 "compute_budget_stated": { 289 "applies": true, 290 "answer": false, 291 "justification": "The paper mentions 'two NVIDIA RTX 6000 Ada GPUs, each with 48GB' but does not report total GPU hours, wall-clock time, or total compute budget for the experiments." 292 } 293 }, 294 "experimental_rigor": { 295 "seed_sensitivity_reported": { 296 "applies": true, 297 "answer": false, 298 "justification": "The paper generates 5 samples per configuration using nucleus sampling but does not report sensitivity to random seeds or variation across different sampling runs." 299 }, 300 "number_of_runs_stated": { 301 "applies": true, 302 "answer": true, 303 "justification": "Section 4 states 'five solution samples were randomly generated with a temperature of 0.2.' The number of samples (n=5) and k values (1 and 5) are explicitly stated." 304 }, 305 "hyperparameter_search_budget": { 306 "applies": true, 307 "answer": false, 308 "justification": "Temperature 0.2 and context window 8192 tokens are used following prior work without reporting any search budget. No justification is provided for why these values were chosen beyond citing prior studies." 309 }, 310 "best_config_selection_justified": { 311 "applies": true, 312 "answer": true, 313 "justification": "The paper reports results for all tested configurations (3 context settings × 3-5 synthesis strategies × 5 models) in Tables 5 and 6, not just the best. The systematic exploration across configurations avoids cherry-picking." 314 }, 315 "multiple_comparison_correction": { 316 "applies": false, 317 "answer": false, 318 "justification": "No statistical tests are performed at all, so there are no tests to correct for multiple comparisons." 319 }, 320 "self_comparison_bias_addressed": { 321 "applies": true, 322 "answer": false, 323 "justification": "The authors evaluate models on their own benchmark without acknowledging potential bias from benchmark design choices that might favor or disfavor certain models." 324 }, 325 "compute_budget_vs_performance": { 326 "applies": true, 327 "answer": false, 328 "justification": "Models range from 6.7B to 34B parameters (plus GPT-3.5), but performance is not analyzed as a function of compute. The larger DeepSeek-33b does not always outperform smaller models, but this relationship is not explicitly analyzed." 329 }, 330 "benchmark_construct_validity": { 331 "applies": true, 332 "answer": true, 333 "justification": "The paper validates the benchmark through human performance (90.93% average from 282 students), test coverage metrics (92% class, 87% function, 86.75% line coverage), code complexity metrics, and systematic comparison with 24 existing benchmarks in Table 1." 334 }, 335 "scaffold_confound_addressed": { 336 "applies": false, 337 "answer": false, 338 "justification": "No scaffolding is used. All models are evaluated via direct prompting with the same prompt template and context settings." 339 } 340 }, 341 "data_leakage": { 342 "temporal_leakage_addressed": { 343 "applies": true, 344 "answer": true, 345 "justification": "The paper argues the benchmark was kept confidential through academic integrity requirements (Section 2.2.1), reducing the risk that models trained on internet data could have seen the benchmark solutions." 346 }, 347 "feature_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of whether the evaluation setup (context settings, project descriptions, class skeletons) provides hints that would not be available in real usage scenarios." 351 }, 352 "non_independence_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether the 4 projects (all game-themed, from the same course) share structural similarities that could affect the independence of performance measurements across projects." 356 }, 357 "leakage_detection_method": { 358 "applies": true, 359 "answer": false, 360 "justification": "No concrete leakage detection method is applied. The paper relies solely on the confidentiality argument without using canary strings, membership inference, or n-gram overlap analysis." 361 } 362 } 363 }, 364 "scan_version": 3, 365 "active_modules": ["experimental_rigor", "data_leakage"], 366 "claims": [ 367 { 368 "claim": "LLMs are far behind undergraduate students in project-level Java programming: no project can be correctly completed by any studied LLM, and the best average test-wise Pass@5 is 48.24% vs. 90.93% human average.", 369 "evidence": "Table 5 shows all-zero project-wise Pass@k under all settings. Section 4.1 reports 48.24% average test-wise Pass@5 for holistic strategy. Table 2 reports 90.93% average human score across 282 students.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Holistic synthesis yields better performance than independent and incremental synthesis strategies across all LLMs.", 374 "evidence": "Table 5 shows holistic averages of 91.73% Completion@1, 72.33% Compilation@1, 70.92% Pass@1 (class-wise) vs. independent 79.70%/62.89%/61.76% and incremental 75.84%/60.81%/59.76%. Finding 2, Section 4.1.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Selected context (method signatures only) strikes the optimal balance, outperforming both maximum and minimum context settings.", 379 "evidence": "Table 6 shows selected context achieves 70.92% class-wise Pass@1 vs. 64.56% (maximum) and 37.47% (minimum). Test-wise: 27.40% vs. 26.55% and 3.79%. Finding 5-6, Section 4.2.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Random and sequential order in incremental synthesis can yield up to 6% improvement over reversed order.", 384 "evidence": "Figure 5 shows for DeepSeek-Coder-6.7b at Pass@5, random order yields 86.07% vs. 80% for reversed. Finding 7, Section 4.3.", 385 "supported": "weak" 386 }, 387 { 388 "claim": "Finer-grained evaluation granularities (class-wise and test-wise) are necessary to capture nuanced performance differences that would be all-zero at project-wise granularity.", 389 "evidence": "Section 4.1 shows class-wise scores around 65% while project-wise Pass@k is all zeros. The gap between class-wise and test-wise is up to 49.92%. Finding 3.", 390 "supported": "strong" 391 } 392 ], 393 "methodology_tags": ["benchmark-eval"], 394 "key_findings": "JavaBench introduces a project-level Java benchmark with 4 OOP projects (389 methods, 106 classes, 92% test coverage) validated by 282 students averaging 90.93%. All five studied LLMs fail to complete any project correctly at project-wise granularity, with the best test-wise Pass@5 averaging 48.24% under holistic synthesis with selected context. Method signatures as context outperform both maximum and minimum context settings, and holistic synthesis outperforms independent and incremental strategies across all models.", 395 "red_flags": [ 396 { 397 "flag": "Very small benchmark scope", 398 "detail": "Only 4 projects, all entry-level game-themed course assignments from the same institution. This narrow domain may not represent real-world industrial Java development or diverse OOP patterns." 399 }, 400 { 401 "flag": "No statistical significance tests", 402 "detail": "All comparative claims across strategies, contexts, and models are based on raw numerical differences without any significance testing. With n=5 stochastic samples, observed differences could be due to sampling noise." 403 }, 404 { 405 "flag": "Outdated model selection", 406 "detail": "The 5 studied models (max 34B parameters, all from 2022-2023) exclude frontier models like GPT-4, Claude, or larger open-source models. Conclusions about 'LLMs' broadly are drawn from a limited and non-frontier model set." 407 }, 408 { 409 "flag": "Unfair human-LLM comparison", 410 "detail": "Students had weeks to complete assignments with access to course materials and could debug iteratively. LLMs had single-pass generation with fixed prompts. The comparison does not control for time, resources, or iterative refinement opportunities." 411 } 412 ], 413 "cited_papers": [ 414 { 415 "title": "Evaluating Large Language Models Trained on Code", 416 "authors": ["Mark Chen", "Jerry Tworek"], 417 "year": 2021, 418 "arxiv_id": "2107.03374", 419 "relevance": "Introduces HumanEval, the dominant code generation benchmark that JavaBench seeks to complement with project-level Java evaluation." 420 }, 421 { 422 "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation", 423 "authors": ["Xueying Du", "Mingwei Liu"], 424 "year": 2023, 425 "arxiv_id": "2308.01861", 426 "relevance": "Class-level code generation benchmark that JavaBench extends to project-level; synthesis strategies and evaluation design are adapted from this work." 427 }, 428 { 429 "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories", 430 "authors": ["Jia Li", "Ge Li"], 431 "year": 2024, 432 "arxiv_id": "2405.19856", 433 "relevance": "Project-level Python benchmark that JavaBench positions itself against as the first project-level Java benchmark." 434 }, 435 { 436 "title": "OOP: Object-Oriented Programming Evaluation Benchmark for Large Language Models", 437 "authors": ["Shuai Wang", "Liang Ding"], 438 "year": 2024, 439 "arxiv_id": "2401.06628", 440 "relevance": "Only prior benchmark claiming OOP evaluation, but JavaBench argues it lacks actual code context and only mentions OOP concepts in prompts." 441 }, 442 { 443 "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models", 444 "authors": ["Hao Yu", "Bo Shen"], 445 "year": 2023, 446 "arxiv_id": "2302.00288", 447 "relevance": "Function-level benchmark covering Python and Java code generation that JavaBench extends to project-level scope." 448 }, 449 { 450 "title": "Program Synthesis with Large Language Models", 451 "authors": ["Jacob Austin", "Augustus Odena"], 452 "year": 2021, 453 "arxiv_id": "2108.07732", 454 "relevance": "Introduces MBPP benchmark, one of the most widely used Python code generation benchmarks." 455 }, 456 { 457 "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation", 458 "authors": ["Fengji Zhang", "Bei Chen"], 459 "year": 2023, 460 "doi": "10.18653/v1/2023.emnlp-main.151", 461 "relevance": "Repository-level code completion technique using retrieval augmentation, relevant to JavaBench's context selection methodology." 462 }, 463 { 464 "title": "Concerned with Data Contamination? Assessing Countermeasures in Code Language Model", 465 "authors": ["Jialun Cao", "Wuqi Zhang"], 466 "year": 2024, 467 "arxiv_id": "2403.16898", 468 "relevance": "Studies data contamination in code LLMs, directly relevant to benchmark evaluation validity." 469 }, 470 { 471 "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation", 472 "authors": ["Max Schäfer", "Sarah Nadi"], 473 "year": 2024, 474 "doi": "10.1109/TSE.2023.3334955", 475 "relevance": "Empirical study of LLM-based test generation, relevant to the intersection of code generation evaluation and test quality." 476 }, 477 { 478 "title": "Lost in the middle: How language models use long contexts", 479 "authors": ["Nelson F Liu", "Kevin Lin"], 480 "year": 2024, 481 "relevance": "Explains LLMs ignoring mid-context information, cited to explain completion errors where models skip TODO markers in long contexts." 482 }, 483 { 484 "title": "Time Travel in LLMs: Tracing Data Contamination in Large Language Models", 485 "authors": ["Shahriar Golchin", "Mihai Surdeanu"], 486 "year": 2023, 487 "arxiv_id": "2308.08493", 488 "relevance": "Framework for detecting data contamination in LLMs, directly relevant to benchmark validity concerns." 489 }, 490 { 491 "title": "CodeGeeX: A Pre-Trained Model for Code Generation with Multilingual Benchmarking on HumanEval-X", 492 "authors": ["Qinkai Zheng", "Xiao Xia"], 493 "year": 2023, 494 "doi": "10.1145/3580305.3599790", 495 "relevance": "Introduces HumanEval-X multilingual benchmark, one of the few existing benchmarks including Java." 496 } 497 ], 498 "engagement_factors": { 499 "practical_relevance": { 500 "score": 2, 501 "justification": "Released benchmark with leaderboard that Java/OOP-focused teams could use to evaluate LLMs, but the 4-project scope limits practical adoption." 502 }, 503 "surprise_contrarian": { 504 "score": 1, 505 "justification": "LLMs struggling with project-level code is expected; the gap from student performance (48% vs 91%) quantifies it but doesn't challenge conventional wisdom." 506 }, 507 "fear_safety": { 508 "score": 0, 509 "justification": "No safety or security concerns raised; this is a capability evaluation benchmark." 510 }, 511 "drama_conflict": { 512 "score": 1, 513 "justification": "Mild drama in 'LLMs are far behind undergraduates' framing, but the result is unsurprising for project-level tasks." 514 }, 515 "demo_ability": { 516 "score": 2, 517 "justification": "Benchmark released on GitHub with a public leaderboard at java-bench.github.io, allowing model developers to test their systems." 518 }, 519 "brand_recognition": { 520 "score": 1, 521 "justification": "Evaluates ChatGPT (gpt-3.5-turbo) and DeepSeek but comes from academic labs, not a well-known AI company." 522 } 523 } 524 }