scan-v5.json (21410B)
1 { 2 "scan_version": 5, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "JavaBench: A Benchmark of Object-Oriented Code Generation for Evaluating Large Language Models", 6 "authors": [ 7 "Jialun Cao", 8 "Zhiyong Chen", 9 "Jiarong Wu", 10 "S. Cheung", 11 "Chang Xu" 12 ], 13 "year": 2024, 14 "venue": "International Conference on Automated Software Engineering", 15 "arxiv_id": "2406.12902", 16 "doi": "10.1145/3691620.3695470" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": false, 23 "justification": "The abstract states 'at most 41.17% Pass@5 in a more relaxed evaluation' but Section 4.1 Finding 4 reports 'The best average test-wise Pass@5 in JavaBench is 48.24%' — a significant numerical inconsistency suggesting the abstract was not updated with final results.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Comparative claims (holistic synthesis outperforms independent/incremental; selected context outperforms maximum/minimum) are supported by controlled ablation experiments in RQ1-RQ3 across five LLMs with consistent results.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "Findings are presented broadly ('LLMs are far behind undergraduate students,' 'LLMs' capability to handle OOP features') based on only 5 models with no GPT-4 or frontier model coverage; the main findings text does not consistently bound claims to the studied models.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The student comparison is not controlled for testing conditions (students work iteratively with full documentation, LLMs make a fixed number of attempts); no alternative explanations for the LLM-student gap are considered.", 42 "source": "haiku" 43 }, 44 
"proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper explicitly distinguishes Completion@k (syntactic completion), Compilation@k (compilability), and Pass@k (semantic correctness via test suite), and characterizes test coverage at 87-92%, clearly separating what is measured from broader code quality.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 5 'Threats to Validity' is a dedicated section discussing benchmark construction quality, LLM generalizability, prompt engineering variance, and data contamination.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Specific threats are named: only 5 LLMs studied due to 'time and hardware limits,' documentation quality affecting generation, prompt engineering variance, and confidentiality as contamination mitigation — concrete enough to be more than boilerplate.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The Threats section acknowledges the 5-LLM limitation but the main findings text still generalizes to 'LLMs' broadly without consistently stating which conclusions should not be extrapolated beyond the studied setting.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "There is no acknowledgments section, funding statement, or grant disclosure anywhere in the paper.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations (HKUST and Nanjing University) are clearly listed on the title page.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding 
is disclosed, so independence cannot be assessed.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement, financial interests declaration, or conflict-of-interest disclosure appears in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "OOP features (encapsulation, inheritance, polymorphism), context settings (maximum/minimum/selected), synthesis strategies (holistic/independent/incremental), and evaluation metrics (Completion/Compilation/Pass@k) are all explicitly defined.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The Contributions section explicitly lists three contributions: significance (first project-level Java OOP benchmark), novelty (systematic evaluation design), and evaluation (extensive experiments with findings).", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Table 1 systematically compares 24 existing benchmarks across language, granularity, and scale; Section 6 discusses how JavaBench extends, differs from, and complements ClassEval, DevEval, RepoEval, and OOPEval.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "benchmark-creation": { 120 "construct_design": { 121 "construct_validity_argued": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper argues that OOP features (inheritance, polymorphism) inherently require multi-class project contexts that function-level benchmarks cannot assess, making project-level Java a necessary condition for the claimed measurement.", 125 "source": "haiku" 126 }, 127 "difficulty_distribution_characterized": { 128 "applies": true, 129 "answer": true, 130 "justification": "Table 3 reports cyclomatic 
and cognitive complexity per project; Table 2 shows human performance mean and standard deviation per project; however, there are only 4 projects with similar difficulty (90-95% human pass rate), providing limited range.", 131 "source": "haiku" 132 }, 133 "ceiling_floor_effects_checked": { 134 "applies": true, 135 "answer": true, 136 "justification": "The paper explicitly shows no ceiling effect (best LLM 48.24% test-wise Pass@5 vs 90.93% human) and notes that project-wise evaluation yields all-zero results — a floor effect at that coarsest granularity, which the finer class-wise and test-wise granularities avoid.", 137 "source": "haiku" 138 }, 139 "human_baseline_included": { 140 "applies": true, 141 "answer": true, 142 "justification": "282 undergraduate students completed the four projects over four academic years, achieving a 90.93/100 average pass rate; per-project breakdown with min/max/mean/std is provided in Table 2.", 143 "source": "haiku" 144 }, 145 "scoring_rubric_justified": { 146 "applies": true, 147 "answer": true, 148 "justification": "The paper explicitly argues for execution-based Pass@k over similarity metrics (BLEU), defines the Pass@k formula, and justifies the class-wise and test-wise granularities as capturing nuanced partial success shadowed by project-wise all-zero evaluation.", 149 "source": "haiku" 150 } 151 }, 152 "robustness": { 153 "contamination_resistance_designed": { 154 "applies": true, 155 "answer": false, 156 "justification": "Contamination resistance relies entirely on academic confidentiality of student assignments — no canary strings, temporal splits, or dynamic generation; the paper acknowledges this as a threat but offers no technical countermeasure.", 157 "source": "haiku" 158 }, 159 "temporal_robustness_discussed": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper does not discuss how the benchmark will remain useful as LLMs improve, does not project a saturation timeline, and does not provide an update or
versioning plan.", 163 "source": "haiku" 164 }, 165 "failure_modes_discussed": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 4.4 analyzes failure modes in LLM-generated code — completion errors, inheritance errors, encapsulation errors, illegal inheritance, documentation non-following, and trivial implementations — with concrete code examples.", 169 "source": "haiku" 170 }, 171 "baseline_implementations_provided": { 172 "applies": true, 173 "answer": true, 174 "justification": "Full implementation is publicly released at https://github.com/java-bench/JavaBench with a leaderboard at https://java-bench.github.io/leaderboard.html enabling reproduction of reported numbers.", 175 "source": "haiku" 176 } 177 }, 178 "documentation": { 179 "dataset_documentation_complete": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 2 documents benchmark format, project descriptions, exercised Java concepts, test construction methodology, code coverage metrics, and human performance; Table 3 provides complete code and test statistics per project.", 183 "source": "haiku" 184 }, 185 "licensing_and_access_clear": { 186 "applies": true, 187 "answer": false, 188 "justification": "The GitHub URL is provided and access is clearly open, but no license is stated in the paper, leaving the terms of reuse legally ambiguous.", 189 "source": "haiku" 190 }, 191 "intended_use_specified": { 192 "applies": true, 193 "answer": false, 194 "justification": "The paper describes what the benchmark measures (project-level Java OOP code generation) but does not specify what should NOT be concluded from benchmark results or warn against misuse cases.", 195 "source": "haiku" 196 } 197 } 198 } 199 }, 200 "claims": [ 201 { 202 "claim": "LLMs are far behind undergraduates on project-level Java OOP tasks: the best LLM achieves at most 48.24% test-wise Pass@5 vs 90.93% for undergraduates, and no LLM correctly completes any project in
project-wise evaluation.", 203 "evidence": "Table 5 and Section 4.1 Finding 4 report all-zero project-wise Pass@5 across all 5 LLMs; 48.24% best average test-wise Pass@5 vs 90.93% student average in Table 2.", 204 "supported": "strong" 205 }, 206 { 207 "claim": "95.8% of existing code generation benchmarks target Python, and only 5 involve Java, all at function level.", 208 "evidence": "Table 1 surveys 24 benchmarks; 23/24 involve Python; only Concode, HumanEval-X, MBXP, MultiPL-MBPP, CoderEval involve Java at function level.", 209 "supported": "strong" 210 }, 211 { 212 "claim": "Holistic synthesis (generating all methods in a class in one pass) consistently outperforms independent and incremental synthesis strategies.", 213 "evidence": "Table 5 shows holistic achieves best Completion@1 (91.73%), Compilation@1 (72.33%), and Pass@1 (70.92%) averaged across all LLMs; consistent across all 5 models.", 214 "supported": "strong" 215 }, 216 { 217 "claim": "Selected context (providing only method signatures of dependent classes) strikes the optimal balance, outperforming both maximum and minimum context settings.", 218 "evidence": "Table 6 shows selected context achieves 70.92% class-wise Pass@1 vs 64.56% maximum and 37.47% minimum; minimum context produces near-zero test-wise pass rates.", 219 "supported": "strong" 220 }, 221 { 222 "claim": "Providing too much context (maximum) or too little context (minimum) both degrade project-level code generation performance.", 223 "evidence": "Table 6 shows minimum context yields near-zero test-wise Pass@1 across all models; maximum context improves some models but degrades others (e.g., DeepSeek-33b drops 23.62pp on P4).", 224 "supported": "moderate" 225 }, 226 { 227 "claim": "AssertionFailedError and IllegalArgumentException account for 76.63% of test-failing errors in LLM-generated code.", 228 "evidence": "Figure 6 exception distribution analysis reports AssertionFailedError at 50.75% and IllegalArgumentException at 25.88% of 
test failures.", 229 "supported": "strong" 230 } 231 ], 232 "methodology_tags": [ 233 "benchmark-eval", 234 "observational" 235 ], 236 "key_findings": "JavaBench demonstrates a substantial gap between LLM and human performance on project-level Java OOP code generation: the best LLM (DeepSeek-33b) achieves 48.24% test-wise Pass@5 versus 90.93% for undergraduates, with all LLMs scoring 0% on project-wise evaluation. Holistic synthesis (generating all methods in a class simultaneously) outperforms independent and incremental strategies across all five evaluated LLMs. The benchmark identifies three main error types in LLM-generated Java: completion failures, OOP-specific compilation errors (inheritance, encapsulation, polymorphism violations), and test failures from documentation non-following and trivial implementations. Selected context (method signatures of dependent types only) provides the optimal balance between information richness and input token efficiency.", 237 "red_flags": [ 238 { 239 "flag": "Abstract-body number discrepancy", 240 "detail": "The abstract states 'at most 41.17% Pass@5 in a more relaxed evaluation' but Section 4.1 Finding 4 reports 'The best average test-wise Pass@5 in JavaBench is 48.24%'. These numbers are inconsistent, suggesting the abstract was not updated with final experimental results." 241 }, 242 { 243 "flag": "Extremely small benchmark scale", 244 "detail": "JavaBench contains only 4 projects, making it difficult to draw statistically robust conclusions about LLM capabilities; performance differences between projects are substantial (e.g., P2 test-wise Pass@1 near 0% for multiple models)." 245 }, 246 { 247 "flag": "LLM coverage excludes frontier models", 248 "detail": "Only 5 LLMs are evaluated, with GPT-3.5 as the largest closed-source model; GPT-4, Claude, and Gemini are absent due to resource constraints, limiting the benchmark's ability to characterize the full capability range." 
249 }, 250 { 251 "flag": "No technical contamination resistance", 252 "detail": "Contamination mitigation relies entirely on academic confidentiality of student assignments (2019-2022), with no canary strings, temporal splits, or dynamic generation — the confidentiality claim is unverifiable." 253 }, 254 { 255 "flag": "No funding disclosure", 256 "detail": "The paper includes no acknowledgments, funding statement, or grant information, making it impossible to assess potential conflicts of interest." 257 }, 258 { 259 "flag": "Benchmark license unspecified", 260 "detail": "The paper provides a GitHub URL for the benchmark but does not specify any license, leaving reuse terms legally ambiguous." 261 } 262 ], 263 "cited_papers": [ 264 { 265 "title": "HumanEval: Evaluating Large Language Models Trained on Code", 266 "relevance": "The primary baseline code generation benchmark that JavaBench is compared against and positioned to extend to project-level Java OOP." 267 }, 268 { 269 "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation", 270 "relevance": "The most directly comparable predecessor benchmark at class-level Python; JavaBench extends to project-level Java and adopts similar synthesis strategy designs." 271 }, 272 { 273 "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories", 274 "relevance": "Another project-level Python benchmark that JavaBench is compared against in Table 1 to establish the gap in Java project-level evaluation." 275 }, 276 { 277 "title": "OOP: Object-Oriented Programming Evaluation Benchmark for Large Language Models", 278 "relevance": "The only prior benchmark claiming to test OOP features, which JavaBench critiques for not providing actual code context — only OOP concepts in prompts." 
279 }, 280 { 281 "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation", 282 "relevance": "A key RAG-based approach for project-level code completion that the selected context design partially complements." 283 }, 284 { 285 "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion", 286 "relevance": "A multilingual benchmark including Java that JavaBench is distinguished from by providing project-level rather than statement-level evaluation." 287 }, 288 { 289 "title": "Time Travel in LLMs: Tracing Data Contamination in Large Language Models", 290 "relevance": "Referenced as the basis for contamination concerns that JavaBench attempts to mitigate through academic confidentiality." 291 }, 292 { 293 "title": "Lost in the Middle: How Language Models Use Long Contexts", 294 "relevance": "Supports the motivation for selective context design — finding that LLMs fail to use information in the middle of long contexts, relevant to JavaBench's context setting ablation." 295 } 296 ], 297 "engagement_factors": { 298 "practical_relevance": { 299 "score": 2, 300 "justification": "Engineers choosing LLMs for Java enterprise development and researchers designing prompting strategies can directly apply the context selection and synthesis strategy findings." 301 }, 302 "surprise_contrarian": { 303 "score": 2, 304 "justification": "The finding that method signatures only (not full context) outperforms providing maximum context is counterintuitive and practically actionable; the severity of the LLM-student gap on what seems like a student-grade assignment is striking." 305 }, 306 "fear_safety": { 307 "score": 0, 308 "justification": "No AI safety or risk concerns are raised; the paper focuses on capability evaluation with no threat modeling." 
309 }, 310 "drama_conflict": { 311 "score": 1, 312 "justification": "The LLMs-vs-undergraduates framing has mild drama potential, but the domain (student Java assignments) limits headline appeal." 313 }, 314 "demo_ability": { 315 "score": 3, 316 "justification": "A public leaderboard (java-bench.github.io/leaderboard.html) and GitHub repository allow anyone to immediately test their model and compare results." 317 }, 318 "brand_recognition": { 319 "score": 1, 320 "justification": "HKUST and Nanjing University are reputable academic institutions but not AI-brand-name labs; no famous product or company association." 321 } 322 }, 323 "hn_data": { 324 "threads": [ 325 { 326 "hn_id": "39483482", 327 "title": "Show HN: OK-Robot: open, modular home robot framework for pick-and-drop anywhere", 328 "points": 542, 329 "comments": 110, 330 "url": "https://news.ycombinator.com/item?id=39483482" 331 }, 332 { 333 "hn_id": "36475563", 334 "title": "AudioPaLM: A large language model that can speak and listen", 335 "points": 69, 336 "comments": 11, 337 "url": "https://news.ycombinator.com/item?id=36475563" 338 }, 339 { 340 "hn_id": "40727755", 341 "title": "Adversarial Perturbations Cannot Reliably Protect Artists from Generative AI", 342 "points": 5, 343 "comments": 0, 344 "url": "https://news.ycombinator.com/item?id=40727755" 345 }, 346 { 347 "hn_id": "40755630", 348 "title": "Adversarial Perturbations Cannot Reliably Protect Artists from Generative AI", 349 "points": 4, 350 "comments": 1, 351 "url": "https://news.ycombinator.com/item?id=40755630" 352 }, 353 { 354 "hn_id": "41617735", 355 "title": "WaveletGPT: Wavelets Meet Large Language Models", 356 "points": 4, 357 "comments": 0, 358 "url": "https://news.ycombinator.com/item?id=41617735" 359 }, 360 { 361 "hn_id": "39764168", 362 "title": "A tweezer array with 6100 highly coherent atomic qubits", 363 "points": 3, 364 "comments": 0, 365 "url": "https://news.ycombinator.com/item?id=39764168" 366 }, 367 { 368 "hn_id": "40748080", 
369 "title": "Adversarial Perturbations Cannot Reliably Protect Artists from Generative AI", 370 "points": 2, 371 "comments": 0, 372 "url": "https://news.ycombinator.com/item?id=40748080" 373 }, 374 { 375 "hn_id": "27612994", 376 "title": "LegoFormer: Transformers for Block-by-Block Multi-View 3D Reconstruction", 377 "points": 2, 378 "comments": 0, 379 "url": "https://news.ycombinator.com/item?id=27612994" 380 }, 381 { 382 "hn_id": "40855651", 383 "title": "Generalist Lightweight Model for Various Information Extraction Tasks", 384 "points": 1, 385 "comments": 0, 386 "url": "https://news.ycombinator.com/item?id=40855651" 387 } 388 ], 389 "top_points": 542, 390 "total_points": 632, 391 "total_comments": 122 392 } 393 }