scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (21410B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "JavaBench: A Benchmark of Object-Oriented Code Generation for Evaluating Large Language Models",
      6     "authors": [
      7       "Jialun Cao",
      8       "Zhiyong Chen",
      9       "Jiarong Wu",
     10       "S. Cheung",
     11       "Chang Xu"
     12     ],
     13     "year": 2024,
     14     "venue": "International Conference on Automated Software Engineering",
     15     "arxiv_id": "2406.12902",
     16     "doi": "10.1145/3691620.3695470"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract states 'at most 41.17% Pass@5 in a more relaxed evaluation' but Section 4.1 Finding 4 reports 'The best average test-wise Pass@5 in JavaBench is 48.24%' — a significant numerical inconsistency suggesting the abstract was not updated with final results.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Comparative claims (holistic synthesis outperforms independent/incremental; selected context outperforms maximum/minimum) are supported by controlled ablation experiments in RQ1-RQ3 across five LLMs with consistent results.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Findings are presented broadly ('LLMs are far behind undergraduate students,' 'LLMs' capability to handle OOP features') based on only 5 models with no GPT-4 or frontier model coverage; the main findings text does not consistently bound claims to the studied models.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The student comparison is not controlled for testing conditions (students work iteratively with full documentation, LLMs make a fixed number of attempts); no alternative explanations for the LLM-student gap are considered.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly distinguishes Completion@k (syntactic completion), Compilation@k (compilability), and Pass@k (semantic correctness via test suite), and characterizes test coverage at 87-92%, clearly separating what is measured from broader code quality.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 5 'Threats to Validity' is a dedicated section discussing benchmark construction quality, LLM generalizability, prompt engineering variance, and data contamination.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are named: only 5 LLMs studied due to 'time and hardware limits,' documentation quality affecting generation, prompt engineering variance, and confidentiality as contamination mitigation — concrete enough to be more than boilerplate.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The Threats section acknowledges the 5-LLM limitation but the main findings text still generalizes to 'LLMs' broadly without consistently stating which conclusions should not be extrapolated beyond the studied setting.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "There is no acknowledgments section, funding statement, or grant disclosure anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations (HKUST and Nanjing University) are clearly listed on the title page.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement, financial interests declaration, or conflict-of-interest disclosure appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "OOP features (encapsulation, inheritance, polymorphism), context settings (maximum/minimum/selected), synthesis strategies (holistic/independent/incremental), and evaluation metrics (Completion/Compilation/Pass@k) are all explicitly defined.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The Contributions section explicitly lists three contributions: significance (first project-level Java OOP benchmark), novelty (systematic evaluation design), and evaluation (extensive experiments with findings).",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 1 systematically compares 24 existing benchmarks across language, granularity, and scale; Section 6 discusses how JavaBench extends, differs from, and complements ClassEval, DevEval, RepoEval, and OOPEval.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues that OOP features (inheritance, polymorphism) inherently require multi-class project contexts that function-level benchmarks cannot assess, making project-level Java a necessary condition for the claimed measurement.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Table 3 reports cyclomatic and cognitive complexity per project; Table 2 shows human performance mean and standard deviation per project; however, there are only 4 projects with similar difficulty (90-95% human pass rate), providing limited range.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": true,
     136           "justification": "The paper explicitly shows no ceiling effect (best LLM 48.24% test-wise Pass@5 vs 90.93% human) and notes that project-wise evaluation yields all-zero results — a floor effect at the coarsest granularity that the finer class-wise and test-wise granularities avoid, indicating the benchmark is appropriately challenging.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "282 undergraduate students completed the four projects over four academic years, achieving a 90.93/100 average pass rate; per-project breakdown with min/max/mean/std is provided in Table 2.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "The paper explicitly argues execution-based Pass@k over similarity metrics (BLEU), defines the Pass@k formula, and justifies the class-wise and test-wise granularities as capturing nuanced partial success shadowed by project-wise all-zero evaluation.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Contamination resistance relies entirely on academic confidentiality of student assignments — no canary strings, temporal splits, or dynamic generation; the paper acknowledges this as a threat but offers no technical countermeasure.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper does not discuss how the benchmark will remain useful as LLMs improve, does not project a saturation timeline, and does not provide an update or versioning plan.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": true,
     168           "justification": "Section 4.4 analyzes six categories of benchmark failure modes in generated code: completion errors, inheritance errors, encapsulation errors, illegal inheritance, documentation non-following, and trivial implementations — with concrete code examples.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Full implementation is publicly released at https://github.com/java-bench/JavaBench with a leaderboard at https://java-bench.github.io/leaderboard.html enabling reproduction of reported numbers.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Section 2 documents benchmark format, project descriptions, exercised Java concepts, test construction methodology, code coverage metrics, and human performance; Table 3 provides complete code and test statistics per project.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The GitHub URL is provided and access is clearly open, but no license is stated in the paper, leaving the terms of reuse legally ambiguous.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "The paper describes what the benchmark measures (project-level Java OOP code generation) but does not specify what should NOT be concluded from benchmark results or warn against misuse cases.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "LLMs are far behind undergraduates on project-level Java OOP tasks: the best LLM achieves at most 48.24% test-wise Pass@5 vs 90.93% for undergraduates, and no LLM correctly completes any project in project-wise evaluation.",
    203       "evidence": "Table 5 and Section 4.1 Finding 4 report all-zero project-wise Pass@5 across all 5 LLMs; 48.24% best average test-wise Pass@5 vs 90.93% student average in Table 2.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "95.8% of existing code generation benchmarks target Python, and only 5 involve Java, all at function level.",
    208       "evidence": "Table 1 surveys 24 benchmarks; 23/24 involve Python; only Concode, HumanEval-X, MBXP, MultiPL-MBPP, CoderEval involve Java at function level.",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Holistic synthesis (generating all methods in a class in one pass) consistently outperforms independent and incremental synthesis strategies.",
    213       "evidence": "Table 5 shows holistic achieves best Completion@1 (91.73%), Compilation@1 (72.33%), and Pass@1 (70.92%) averaged across all LLMs; consistent across all 5 models.",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "Selected context (providing only method signatures of dependent classes) strikes the optimal balance, outperforming both maximum and minimum context settings.",
    218       "evidence": "Table 6 shows selected context achieves 70.92% class-wise Pass@1 vs 64.56% maximum and 37.47% minimum; minimum context produces near-zero test-wise pass rates.",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "Providing too much context (maximum) or too little context (minimum) both degrade project-level code generation performance.",
    223       "evidence": "Table 6 shows minimum context yields near-zero test-wise Pass@1 across all models; maximum context improves some models but degrades others (e.g., DeepSeek-33b drops 23.62pp on P4).",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "AssertionFailedError and IllegalArgumentException account for 76.63% of test-failing errors in LLM-generated code.",
    228       "evidence": "Figure 6 exception distribution analysis reports AssertionFailedError at 50.75% and IllegalArgumentException at 25.88% of test failures.",
    229       "supported": "strong"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval",
    234     "observational"
    235   ],
    236   "key_findings": "JavaBench demonstrates a substantial gap between LLM and human performance on project-level Java OOP code generation: the best LLM (DeepSeek-33b) achieves 48.24% test-wise Pass@5 versus 90.93% for undergraduates, with all LLMs scoring 0% on project-wise evaluation. Holistic synthesis (generating all methods in a class simultaneously) outperforms independent and incremental strategies across all five evaluated LLMs. The benchmark identifies three main error types in LLM-generated Java: completion failures, OOP-specific compilation errors (inheritance, encapsulation, polymorphism violations), and test failures from documentation non-following and trivial implementations. Selected context (method signatures of dependent types only) provides the optimal balance between information richness and input token efficiency.",
    237   "red_flags": [
    238     {
    239       "flag": "Abstract-body number discrepancy",
    240       "detail": "The abstract states 'at most 41.17% Pass@5 in a more relaxed evaluation' but Section 4.1 Finding 4 reports 'The best average test-wise Pass@5 in JavaBench is 48.24%'. These numbers are inconsistent, suggesting the abstract was not updated with final experimental results."
    241     },
    242     {
    243       "flag": "Extremely small benchmark scale",
    244       "detail": "JavaBench contains only 4 projects, making it difficult to draw statistically robust conclusions about LLM capabilities; performance differences between projects are substantial (e.g., P2 test-wise Pass@1 near 0% for multiple models)."
    245     },
    246     {
    247       "flag": "LLM coverage excludes frontier models",
    248       "detail": "Only 5 LLMs are evaluated, with GPT-3.5 as the largest closed-source model; GPT-4, Claude, and Gemini are absent due to resource constraints, limiting the benchmark's ability to characterize the full capability range."
    249     },
    250     {
    251       "flag": "No technical contamination resistance",
    252       "detail": "Contamination mitigation relies entirely on academic confidentiality of student assignments (2019-2022), with no canary strings, temporal splits, or dynamic generation — the confidentiality claim is unverifiable."
    253     },
    254     {
    255       "flag": "No funding disclosure",
    256       "detail": "The paper includes no acknowledgments, funding statement, or grant information, making it impossible to assess potential conflicts of interest."
    257     },
    258     {
    259       "flag": "Benchmark license unspecified",
    260       "detail": "The paper provides a GitHub URL for the benchmark but does not specify any license, leaving reuse terms legally ambiguous."
    261     }
    262   ],
    263   "cited_papers": [
    264     {
    265       "title": "HumanEval: Evaluating Large Language Models Trained on Code",
    266       "relevance": "The primary baseline code generation benchmark that JavaBench is compared against and positioned to extend to project-level Java OOP."
    267     },
    268     {
    269       "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation",
    270       "relevance": "The most directly comparable predecessor benchmark at class-level Python; JavaBench extends to project-level Java and adopts similar synthesis strategy designs."
    271     },
    272     {
    273       "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories",
    274       "relevance": "Another project-level Python benchmark that JavaBench is compared against in Table 1 to establish the gap in Java project-level evaluation."
    275     },
    276     {
    277       "title": "OOP: Object-Oriented Programming Evaluation Benchmark for Large Language Models",
    278       "relevance": "The only prior benchmark claiming to test OOP features, which JavaBench critiques for not providing actual code context — only OOP concepts in prompts."
    279     },
    280     {
    281       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    282       "relevance": "A key RAG-based approach for project-level code completion that the selected context design partially complements."
    283     },
    284     {
    285       "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
    286       "relevance": "A multilingual benchmark including Java that JavaBench is distinguished from by providing project-level rather than statement-level evaluation."
    287     },
    288     {
    289       "title": "Time Travel in LLMs: Tracing Data Contamination in Large Language Models",
    290       "relevance": "Referenced as the basis for contamination concerns that JavaBench attempts to mitigate through academic confidentiality."
    291     },
    292     {
    293       "title": "Lost in the Middle: How Language Models Use Long Contexts",
    294       "relevance": "Supports the motivation for selective context design — finding that LLMs fail to use information in the middle of long contexts, relevant to JavaBench's context setting ablation."
    295     }
    296   ],
    297   "engagement_factors": {
    298     "practical_relevance": {
    299       "score": 2,
    300       "justification": "Engineers choosing LLMs for Java enterprise development and researchers designing prompting strategies can directly apply the context selection and synthesis strategy findings."
    301     },
    302     "surprise_contrarian": {
    303       "score": 2,
     304       "justification": "The finding that providing only method signatures (not full context) outperforms providing maximum context is counterintuitive and practically actionable; the severity of the LLM-student gap on what seems like a student-grade assignment is striking."
    305     },
    306     "fear_safety": {
    307       "score": 0,
    308       "justification": "No AI safety or risk concerns are raised; the paper focuses on capability evaluation with no threat modeling."
    309     },
    310     "drama_conflict": {
    311       "score": 1,
    312       "justification": "The LLMs-vs-undergraduates framing has mild drama potential, but the domain (student Java assignments) limits headline appeal."
    313     },
    314     "demo_ability": {
    315       "score": 3,
    316       "justification": "A public leaderboard (java-bench.github.io/leaderboard.html) and GitHub repository allow anyone to immediately test their model and compare results."
    317     },
    318     "brand_recognition": {
    319       "score": 1,
    320       "justification": "HKUST and Nanjing University are reputable academic institutions but not AI-brand-name labs; no famous product or company association."
    321     }
    322   },
    323   "hn_data": {
    324     "threads": [
    325       {
    326         "hn_id": "39483482",
    327         "title": "Show HN: OK-Robot: open, modular home robot framework for pick-and-drop anywhere",
    328         "points": 542,
    329         "comments": 110,
    330         "url": "https://news.ycombinator.com/item?id=39483482"
    331       },
    332       {
    333         "hn_id": "36475563",
    334         "title": "AudioPaLM: A large language model that can speak and listen",
    335         "points": 69,
    336         "comments": 11,
    337         "url": "https://news.ycombinator.com/item?id=36475563"
    338       },
    339       {
    340         "hn_id": "40727755",
    341         "title": "Adversarial Perturbations Cannot Reliably Protect Artists from Generative AI",
    342         "points": 5,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=40727755"
    345       },
    346       {
    347         "hn_id": "40755630",
    348         "title": "Adversarial Perturbations Cannot Reliably Protect Artists from Generative AI",
    349         "points": 4,
    350         "comments": 1,
    351         "url": "https://news.ycombinator.com/item?id=40755630"
    352       },
    353       {
    354         "hn_id": "41617735",
    355         "title": "WaveletGPT: Wavelets Meet Large Language Models",
    356         "points": 4,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=41617735"
    359       },
    360       {
    361         "hn_id": "39764168",
    362         "title": "A tweezer array with 6100 highly coherent atomic qubits",
    363         "points": 3,
    364         "comments": 0,
    365         "url": "https://news.ycombinator.com/item?id=39764168"
    366       },
    367       {
    368         "hn_id": "40748080",
    369         "title": "Adversarial Perturbations Cannot Reliably Protect Artists from Generative AI",
    370         "points": 2,
    371         "comments": 0,
    372         "url": "https://news.ycombinator.com/item?id=40748080"
    373       },
    374       {
    375         "hn_id": "27612994",
    376         "title": "LegoFormer: Transformers for Block-by-Block Multi-View 3D Reconstruction",
    377         "points": 2,
    378         "comments": 0,
    379         "url": "https://news.ycombinator.com/item?id=27612994"
    380       },
    381       {
    382         "hn_id": "40855651",
    383         "title": "Generalist Lightweight Model for Various Information Extraction Tasks",
    384         "points": 1,
    385         "comments": 0,
    386         "url": "https://news.ycombinator.com/item?id=40855651"
    387       }
    388     ],
    389     "top_points": 542,
    390     "total_points": 632,
    391     "total_comments": 122
    392   }
    393 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs