scan-v4.json (35593B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Evaluating Diverse Large Language Models for Automatic and General Bug Reproduction", 6 "authors": [ 7 "Sungmin Kang", 8 "Juyeon Yoon", 9 "Nargiz Askarbekkyzy", 10 "Shin Yoo" 11 ], 12 "year": 2023, 13 "venue": "IEEE Transactions on Software Engineering", 14 "arxiv_id": "2311.04532", 15 "doi": "10.1109/TSE.2024.3450837" 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "Abstract claims — one-third reproduction rate (251/750 = 33.5%, Table 5), StarCoder at 70% of Codex (Section 6.4.1), 90% on GHRB (Section 6.4.2), performance improving with size (Figure 8a) — are all supported by experimental results.", 23 "source": "opus" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": true, 28 "justification": "Causal claims like 'fine-tuning on natural language can hurt performance' are supported by controlled comparisons within the same model family (StarCoder vs StarCoderPlus, Bloom vs BloomZ), where training data/technique is the only difference. Ablation of prompt components (Table 4) also follows single-variable manipulation.", 29 "source": "opus" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": true, 34 "justification": "The paper bounds claims to Java (all benchmarks are Java), acknowledges Defects4J may be in training data (Section 4.1), creates GHRB to verify generalization, and notes project-specific variation (Table 5). Title uses 'Evaluating' rather than claiming universal capability.", 35 "source": "opus" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": true, 40 "justification": "Section 7.2 analyzes whether LIBRO's performance is due to code extraction from reports vs genuine synthesis. 
Section 4.1 and RQ3/RQ4-2 address whether results are due to LLM memorization vs actual capability, verified with GHRB and membership tests.", 41 "source": "opus" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper directly measures bug reproduction (test fails on buggy version, passes on fixed version) and frames results as bug reproduction performance. The measurement matches the claim with no proxy gap.", 47 "source": "opus" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": false, 54 "justification": "No dedicated 'Limitations' or 'Threats to Validity' section exists. The paper's sections are: Introduction, Motivation, Approach, Evaluation, Research Questions, Results, Discussion, Related Work, Conclusion. Limitations are scattered across RQ discussions but not collected in a dedicated section.", 55 "source": "opus" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": true, 60 "justification": "Specific threats are discussed throughout: Defects4J likely in training data (Section 4.1), GHRB is small (31 bugs), Checkstyle failures due to external file dependency (Section 7.1), ChatGPT behavior changes breaking the pipeline (Section 6.4.3), and temperature sensitivity (Section 6.4.5). These are specific to this study.", 61 "source": "opus" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": false, 66 "justification": "No explicit scope boundary statements about what the results do NOT show. The evaluation is limited to Java and specific benchmarks, but this is implicit rather than explicitly stated as a boundary. The paper doesn't have a 'what this does not show' discussion.", 67 "source": "opus" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": false, 74 "justification": "No funding source or acknowledgments section is present in the paper text. 
There is no mention of grants, corporate sponsors, or funding agencies.", 75 "source": "opus" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "Author affiliations are clearly disclosed: all authors are from KAIST (Korea Advanced Institute of Science and Technology). They are academic researchers not affiliated with any LLM company they evaluate.", 81 "source": "opus" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": false, 86 "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The absence of any funding disclosure makes this unanswerable.", 87 "source": "opus" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests or financial interests statement is present in the paper.", 93 "source": "opus" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "BRT (Bug Reproducing Test) is defined operationally as a test that fails on the buggy version and passes on the fixed version; FIB (Fail In Buggy) is defined; the report-to-test task is explicitly characterized with examples.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper clearly identifies four new contributions over the prior ICSE paper: GPU-performance tradeoff analysis, LLM size effects, ChatGPT behavioral change analysis, and self-consistency evaluation across LLMs.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 8 explicitly contrasts LIBRO with crash reproduction techniques (EvoCrash, JCharming, Botsing), test generation approaches (EvoSuite, CodaMosa), and code synthesis work (AlphaCode, CodeT), explaining mechanistic differences.", 113 "source": "haiku" 114 } 115 } 116 }, 117 
"type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": true, 123 "justification": "The authors provide both the tool (https://github.com/coinse/libro) and the replication package for the journal extension (https://github.com/coinse/libro-journal-artifact), mentioned in Sections 1 and 4.3.", 124 "source": "opus" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": true, 129 "justification": "Defects4J is a public benchmark. The GHRB dataset is described and the experimental data and analysis scripts are publicly available (Section 1). The replication package includes experimental data.", 130 "source": "opus" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": true, 135 "justification": "Section 4.3 specifies: Ubuntu 18.04.6 LTS with 32GB RAM and Intel i7-7700 CPU for test execution; Ubuntu 20.04.6 LTS with 16 Xeon Gold 5222 CPUs and 4 NVIDIA RTX 3090 GPUs (96GB VRAM) for LLM inference. Python 3.9 and the javalang library are also specified.", 136 "source": "opus" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": true, 141 "justification": "A dedicated replication package is provided at https://github.com/coinse/libro-journal-artifact with experimental data and analysis scripts. The tool repository is also public.", 142 "source": "opus" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": true, 149 "justification": "Figure 3 reports 50%, 80%, and 95% intervals from 1000-run simulations relating the number of generation attempts to reproduction performance. Table 4 reports 5th percentile, median, and 95th percentile for the two-example n=10 setting sampled from n=50 results.", 150 "source": "opus" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": false, 155 "justification": "No statistical significance tests are used. 
Comparisons between LLMs (Figure 6) and settings (Table 4) are based on raw number comparisons without p-values, t-tests, or any formal hypothesis testing.", 156 "source": "opus" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "The paper reports relative performance ratios (StarCoder at 70% of Codex on Defects4J, 90% on GHRB), absolute counts (251/750 = 33.5%), and percentage comparisons consistently throughout Section 6.", 162 "source": "opus" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "No justification is provided for why 750 Defects4J bugs or 31 GHRB bugs are sufficient sample sizes. No power analysis is discussed. The GHRB dataset is particularly small at 31 bugs.", 168 "source": "opus" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": false, 173 "justification": "Main LLM comparison results (Figure 6, Table 5) are single-run point estimates without variance measures. While Figure 3 and Table 4 show intervals for subsampling simulations, the core comparative results across models lack standard deviations or spread measures.", 174 "source": "opus" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 6.1.2 compares LIBRO against EvoCrash (state-of-the-art crash reproduction) and a Copy&Paste baseline that extracts code snippets from bug reports (Figure 2).", 182 "source": "opus" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": true, 187 "justification": "EvoCrash was the state-of-the-art crash reproduction technique at the time. 
The authors note there are no existing general bug reproduction techniques, making EvoCrash the most relevant comparison.", 188 "source": "opus" 189 }, 190 "ablation_study": { 191 "applies": true, 192 "answer": true, 193 "justification": "Table 4 presents systematic ablation of prompt components: no example, one example, two examples, within-project examples, constructor info, stack traces. Temperature ablation is in Section 6.4.5.", 194 "source": "opus" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 "answer": true, 199 "justification": "Multiple metrics are used: number of bugs reproduced, FIB count, ROC-AUC for selection (Figure 4), acc@n and precision@n for ranking (Table 7), wef@n for wasted effort, and GPU memory usage (Figure 7).", 200 "source": "opus" 201 }, 202 "human_evaluation": { 203 "applies": true, 204 "answer": false, 205 "justification": "All evaluation is automated (fail on buggy version, pass on fixed version). No human evaluation of test quality, readability, or developer usefulness is performed, despite the paper's claims about reducing developer effort.", 206 "source": "opus" 207 }, 208 "held_out_test_set": { 209 "applies": true, 210 "answer": true, 211 "justification": "The GHRB dataset (Section 4.1) was specifically collected from PRs after the Codex training data cutoff, serving as a held-out evaluation set to mitigate data leakage concerns.", 212 "source": "opus" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "Table 5 provides per-project breakdown of reproduction performance on Defects4J. Table 8 provides per-project breakdown for GHRB. 
Figure 6 shows per-LLM breakdown.", 218 "source": "opus" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Section 7.1 presents a detailed failure case (Checkstyle Issue #11365, Listing 5) where LIBRO fails because the test references a non-existent file, illustrating the limitation of not being able to modify execution environments.", 224 "source": "opus" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "Several negative results: within-project examples hurt performance (Table 4, Section 6.1.1), fine-tuning on natural language hurts code tasks (StarCoderPlus and BloomZ, Section 6.4.1), Closure project shows poor performance, ChatGPT behavior change broke pipeline (Section 6.4.3).", 230 "source": "opus" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 "answer": true, 237 "justification": "Table 3 lists exact model identifiers: code-davinci-002, text-davinci-003, gpt-3.5-turbo-0301, gpt-3.5-turbo-0613, Bloom-176B, BloomZ-176B, Incoder-(1,6)B, CodeGen2-(1,3.7,7,16)B, StarCoder-15B, StarCoderBase-15B, StarCoderPlus-15B with sizes and release years.", 238 "source": "opus" 239 }, 240 "prompts_provided": { 241 "applies": true, 242 "answer": true, 243 "justification": "Listing 1 shows the complete prompt format used, including the Markdown structure, reproduction instruction, and code block initiation. The full prompt text is provided, not just a description.", 244 "source": "opus" 245 }, 246 "hyperparameters_reported": { 247 "applies": true, 248 "answer": true, 249 "justification": "Section 4.3 states: temperature 0.7 (default), maximum generated tokens 256, n=10 or n=50 test samples. 
Temperature is further explored across 7 values in Section 6.4.5.", 250 "source": "opus" 251 }, 252 "scaffolding_described": { 253 "applies": true, 254 "answer": true, 255 "justification": "The LIBRO pipeline is described in detail: Section 3 provides four stages (prompt engineering, LLM querying, postprocessing, selection & ranking) with formal algorithms (Algorithm 1 for test postprocessing, Algorithm 2 for selection and ranking) and worked examples.", 256 "source": "opus" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": true, 261 "justification": "Section 4.1 documents filtering: Defects4J 814 → 750 bugs (58 excluded for poor report-bug mapping, 6 for structural changes). GHRB: 581 PRs → 435 (non-test-introducing removed) → 84 (non-merged or multi-issue removed) → 31 (verified BRT). Each step has explicit criteria.", 262 "source": "opus" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": true, 269 "justification": "Experimental data and analysis scripts are publicly available via the replication package (https://github.com/coinse/libro-journal-artifact), enabling independent verification of results.", 270 "source": "opus" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "Section 4.1 describes Defects4J v2.0 collection and filtering in detail. GHRB collection from 17 GitHub repositories is described step-by-step, including the criteria for PR selection and verification process.", 276 "source": "opus" 277 }, 278 "recruitment_methods_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants. 
Data sources are standard benchmarks (Defects4J) and publicly available GitHub repositories (GHRB).", 282 "source": "opus" 283 }, 284 "data_pipeline_documented": { 285 "applies": true, 286 "answer": true, 287 "justification": "Section 4.1 documents: Defects4J 814 bugs → 58 excluded (poor mapping) → 6 excluded (structural differences) → 750. GHRB: 581 PRs → 435 (test-introducing) → 84 (merged, single-issue) → 31 (verified BRT). Each stage has counts and criteria.", 288 "source": "opus" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "The paper references 'the Codex training data cutoff point' (Section 4.1) without stating the actual date. For other LLMs (StarCoder, CodeGen2, etc.), no training cutoff dates are explicitly stated.", 296 "source": "opus" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": true, 300 "answer": true, 301 "justification": "Extensively discussed. Section 4.1 cites Lee et al. showing Defects4J BRTs are in StarCoder training data. The GHRB dataset was created specifically to mitigate this. StarCoder's dataset membership test was used to verify GHRB tests are not in the Stack dataset.", 302 "source": "opus" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": true, 306 "answer": true, 307 "justification": "The paper acknowledges Defects4J is likely in most LLM training data (Section 4.1) and creates the GHRB held-out dataset after the Codex training cutoff. They verify using StarCoder's dataset membership test that GHRB reproducing tests are not in the Stack dataset.", 308 "source": "opus" 309 } 310 }, 311 "human_studies": { 312 "pre_registered": { 313 "applies": false, 314 "answer": false, 315 "justification": "No human participants in this study. 
All evaluation is automated against benchmark programs.", 316 "source": "opus" 317 }, 318 "irb_or_ethics_approval": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants. The study evaluates LLMs on software benchmarks.", 322 "source": "opus" 323 }, 324 "demographics_reported": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human participants in the study.", 328 "source": "opus" 329 }, 330 "inclusion_exclusion_criteria": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants in the study.", 334 "source": "opus" 335 }, 336 "randomization_described": { 337 "applies": false, 338 "answer": false, 339 "justification": "No human participants in the study.", 340 "source": "opus" 341 }, 342 "blinding_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants in the study.", 346 "source": "opus" 347 }, 348 "attrition_reported": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human participants in the study.", 352 "source": "opus" 353 } 354 }, 355 "cost_and_practicality": { 356 "inference_cost_reported": { 357 "applies": true, 358 "answer": true, 359 "justification": "Table 6 reports per-step timing: API querying 5.85s, postprocessing 1.23s, test execution 4.00s, total 444s for 50 tests. Figure 7 plots GPU memory usage per model. Section 1 notes 'more than eight months of GPU time and seven months of CPU time.'", 360 "source": "opus" 361 }, 362 "compute_budget_stated": { 363 "applies": true, 364 "answer": true, 365 "justification": "The paper states the experiments required 'more than eight months of GPU time and seven months of CPU time' (Section 1). 
Section 4.3 specifies the hardware: 4 NVIDIA RTX 3090 GPUs with 96GB total VRAM.", 366 "source": "opus" 367 } 368 }, 369 "experimental_rigor": { 370 "seed_sensitivity_reported": { 371 "applies": true, 372 "answer": false, 373 "justification": "Main LLM comparison results (Figure 6) are single-run results without reporting sensitivity to random seeds. The 1000-run subsampling simulation (Figure 3) shows variance from subsampling but not from different experimental seeds.", 374 "source": "opus" 375 }, 376 "number_of_runs_stated": { 377 "applies": true, 378 "answer": true, 379 "justification": "Section 4.3 explicitly states 'we sample 10 tests (denoted by n=10)' and 'experimenting with sampling 50 tests as well.' The 1000-run simulation count is also stated. Temperature experiments state the number of configurations.", 380 "source": "opus" 381 }, 382 "hyperparameter_search_budget": { 383 "applies": true, 384 "answer": false, 385 "justification": "While 7 temperature values and multiple prompt configurations are explored as research questions (Table 4, Figure 8b), no formal hyperparameter search budget is reported. The search method and total compute for configuration selection are not described.", 386 "source": "opus" 387 }, 388 "best_config_selection_justified": { 389 "applies": true, 390 "answer": true, 391 "justification": "Table 4 shows all prompt configurations and their results. Figure 8b shows all temperature values tested. 
The two-example n=50 setting is chosen based on best observed performance, and all alternatives are transparently reported.", 392 "source": "opus" 393 }, 394 "multiple_comparison_correction": { 395 "applies": true, 396 "answer": false, 397 "justification": "The paper compares 15 LLMs, multiple prompt configurations, and 7 temperature settings without any statistical tests, let alone corrections for multiple comparisons.", 398 "source": "opus" 399 }, 400 "self_comparison_bias_addressed": { 401 "applies": true, 402 "answer": false, 403 "justification": "The authors evaluate their own LIBRO system against baselines (including their own implementation of Copy&Paste) without acknowledging potential author-evaluation bias. No independent evaluation or mitigation strategy is discussed.", 404 "source": "opus" 405 }, 406 "compute_budget_vs_performance": { 407 "applies": true, 408 "answer": true, 409 "justification": "Figure 7 explicitly plots GPU memory usage against reproduction performance for all open-source LLMs, with a Pareto front analysis. Each model is mapped to a specific GPU count, helping practitioners make resource-informed decisions.", 410 "source": "opus" 411 }, 412 "benchmark_construct_validity": { 413 "applies": true, 414 "answer": false, 415 "justification": "The paper does not discuss whether Defects4J or GHRB actually measure real-world bug reproduction capability. They define BRT precisely but do not question whether success on these benchmarks translates to practical developer benefit.", 416 "source": "opus" 417 }, 418 "scaffold_confound_addressed": { 419 "applies": true, 420 "answer": true, 421 "justification": "All LLMs are evaluated using the same LIBRO pipeline (prompt format, postprocessing, ranking), controlling for scaffold differences. 
The comparison is strictly between models within the same framework.", 422 "source": "opus" 423 } 424 }, 425 "data_leakage": { 426 "temporal_leakage_addressed": { 427 "applies": true, 428 "answer": true, 429 "justification": "Section 4.1 creates GHRB from PRs after the Codex training data cutoff. They verify using StarCoder's dataset membership test that GHRB tests are not in the Stack training dataset.", 430 "source": "opus" 431 }, 432 "feature_leakage_addressed": { 433 "applies": true, 434 "answer": true, 435 "justification": "Section 3.1 notes 'our specific template format makes it highly unlikely that prompts we generate exist verbatim within the LLM training data' and discusses that bug reports are only connected to BRTs via chains of references, partly mitigating leakage.", 436 "source": "opus" 437 }, 438 "non_independence_addressed": { 439 "applies": true, 440 "answer": true, 441 "justification": "The paper acknowledges Defects4J is likely in training data (citing Lee et al.), creates GHRB from separate repositories post-cutoff, and verifies using StarCoder's dataset membership test that GHRB test code is not in the Stack dataset.", 442 "source": "opus" 443 }, 444 "leakage_detection_method": { 445 "applies": true, 446 "answer": true, 447 "justification": "Two concrete methods are used: (1) StarCoder's dataset membership test (Section 6.4.2) to verify GHRB tests are not in training data, and (2) temporal splits creating GHRB from post-cutoff PRs.", 448 "source": "opus" 449 } 450 } 451 } 452 }, 453 "claims": [ 454 { 455 "claim": "LIBRO reproduces 33.5% (251/750) of Defects4J bugs using code-davinci-002 with 50 test generation attempts", 456 "evidence": "Table 4 shows the two-example n=50 setting reproduces 251 bugs; Table 5 breaks this down by project", 457 "supported": "strong" 458 }, 459 { 460 "claim": "StarCoder-15B achieves 70% of Codex performance on Defects4J and 90% on the GHRB holdout dataset", 461 "evidence": "Figure 6a shows StarCoder reproduces 125 
bugs vs Codex's 173 (n=10); RQ4-2 text states StarCoder achieves 90% of Codex performance on GHRB at n=50", 462 "supported": "strong" 463 }, 464 { 465 "claim": "Bug reproduction performance scales logarithmically with number of generation attempts with no plateau", 466 "evidence": "Figure 3 shows logarithmic relationship over 5 orders of magnitude (log scale), with no sign of plateauing", 467 "supported": "strong" 468 }, 469 { 470 "claim": "Fine-tuning code LLMs on natural language data hurts bug reproduction performance", 471 "evidence": "StarCoderPlus (natural language fine-tuned) substantially underperforms StarCoder in Figure 6; BloomZ similarly underperforms Bloom", 472 "supported": "moderate" 473 }, 474 { 475 "claim": "LIBRO's self-consistency-based selection algorithm generalizes across all tested LLMs and temperature settings", 476 "evidence": "Figure 9 shows consistent threshold-precision behavior across 8 LLMs; Figure 10 shows ROC-AUC stable at 0.76-0.80 across temperatures", 477 "supported": "strong" 478 }, 479 { 480 "claim": "Results on held-out GHRB bugs (verified not in training data) match Defects4J, suggesting LLMs are not merely memorizing", 481 "evidence": "RQ3 shows 32.2% reproduction on GHRB vs 33.5% on Defects4J; StarCoder membership test confirms GHRB test absence from Stack dataset", 482 "supported": "moderate" 483 }, 484 { 485 "claim": "Temperature 0.6 optimally balances generation diversity and coherence for bug reproduction", 486 "evidence": "Figure 8b peaks at T=0.6 for StarCoder; Table 11 shows best acc@n performance at T=0.6", 487 "supported": "strong" 488 } 489 ], 490 "methodology_tags": [ 491 "benchmark-eval" 492 ], 493 "key_findings": "LIBRO, a pipeline prompting LLMs with natural language bug reports to generate bug-reproducing tests, reproduces 33.5% of Defects4J bugs using Codex, substantially outperforming crash-only baselines and extending bug reproduction to non-crash bugs for the first time at scale. 
In a 15-LLM comparison, open-source StarCoder achieves about 70% of proprietary Codex performance on Defects4J and about 90% on the held-out, post-training-cutoff GHRB dataset, suggesting that results are not merely due to memorization. Key findings include: reproduction scales logarithmically with generation attempts, fine-tuning code LLMs on natural language hurts performance, and the self-consistency-based selection/ranking algorithm generalizes robustly across all LLMs and temperature settings tested.", 494 "red_flags": [ 495 { 496 "flag": "No statistical significance tests", 497 "detail": "All comparative claims (StarCoder vs Codex, ranking vs random baseline, prompt ablations) are made without significance tests despite the paper serving as practitioner guidance for LLM selection." 498 }, 499 { 500 "flag": "GHRB holdout is only 31 bugs", 501 "detail": "The key generalization experiment uses only 31 bugs across 6 projects; with Codex reproducing only 10, single-digit differences between LLMs cannot be reliably distinguished." 502 }, 503 { 504 "flag": "No variance on main results", 505 "detail": "LLM inference is stochastic; Tables 4-5 and 7-11 report only point estimates without standard deviations across independent runs, except for the random baseline simulation." 506 }, 507 { 508 "flag": "'General' in title but Java-only evaluation", 509 "detail": "The paper claims general bug reproduction but evaluates exclusively on Java/JUnit projects; generalization to other languages, ecosystems, or bug report formats is unverified and not bounded." 510 }, 511 { 512 "flag": "No dedicated threats-to-validity section", 513 "detail": "Validity threats are scattered inline (Section 4.1, Section 7) rather than consolidated, making it difficult to assess whether all material threats have been considered." 
514 } 515 ], 516 "cited_papers": [ 517 { 518 "title": "Large Language Models are Few-Shot Testers: Exploring LLM-Based General Bug Reproduction (ICSE 2023)", 519 "relevance": "Prior conference version of this work; the journal paper extends it with the 15-LLM comparison and additional analyses." 520 }, 521 { 522 "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs", 523 "relevance": "Primary benchmark (750 bugs); standard in automated testing research and used for all main results." 524 }, 525 { 526 "title": "StarCoder: May the Source Be With You", 527 "relevance": "Best-performing open-source LLM in the study, achieving 70-90% of Codex performance and enabling low-cost deployment." 528 }, 529 { 530 "title": "Evaluating Large Language Models Trained on Code (Codex)", 531 "relevance": "Introduces code-davinci-002, the best-performing model in the study and the primary performance reference point." 532 }, 533 { 534 "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models", 535 "relevance": "Provides theoretical basis for LIBRO's selection algorithm: consensus among multiple stochastic generations predicts correctness." 536 }, 537 { 538 "title": "How is ChatGPT's Behavior Changing Over Time?", 539 "relevance": "Prior work on LLM behavioral drift that LIBRO partially replicates and reframes as prompt format sensitivity rather than capability degradation." 540 }, 541 { 542 "title": "The GitHub Recent Bugs Dataset for Evaluating LLM-Based Debugging Applications", 543 "relevance": "Companion paper describing the GHRB holdout dataset used to test for training data memorization." 544 }, 545 { 546 "title": "CodaMosa: Escaping Coverage Plateaus in Test Generation with Pre-Trained Large Language Models", 547 "relevance": "Concurrent work combining LLMs with search-based test generation, representing the closest related approach in the test generation literature." 
548 } 549 ], 550 "engagement_factors": { 551 "practical_relevance": { 552 "score": 3, 553 "justification": "LIBRO is immediately usable via public GitHub release, and Figure 7's GPU-performance tradeoff chart directly guides practitioners in selecting an LLM given hardware constraints." 554 }, 555 "surprise_contrarian": { 556 "score": 2, 557 "justification": "The finding that open-source StarCoder achieves 90% of proprietary Codex performance on held-out data, and that fine-tuning on natural language hurts code tasks, challenge common assumptions about the API vs open-source gap." 558 }, 559 "fear_safety": { 560 "score": 0, 561 "justification": "No AI safety, risk, or adversarial concerns are raised; the paper is a straightforward software engineering capability evaluation." 562 }, 563 "drama_conflict": { 564 "score": 1, 565 "justification": "The paper reframes the 'ChatGPT is getting worse' narrative as a prompt-format sensitivity issue rather than capability degradation, offering a mild counterpoint to a widely discussed concern." 566 }, 567 "demo_ability": { 568 "score": 3, 569 "justification": "The tool is publicly available at GitHub and can be run by anyone on Java projects with Defects4J-compatible bug reports." 570 }, 571 "brand_recognition": { 572 "score": 1, 573 "justification": "Published in IEEE TSE (top venue) and evaluates prominent models from OpenAI, Meta, and BigCode, but the authoring institution (KAIST) is less prominent than top US/EU labs." 
574 } 575 }, 576 "hn_data": { 577 "threads": [ 578 { 579 "hn_id": "38283398", 580 "title": "API-Driven Program Synthesis for Testing Static Typing Implementations", 581 "points": 35, 582 "comments": 1, 583 "url": "https://news.ycombinator.com/item?id=38283398", 584 "created_at": "2023-11-15T22:19:08Z" 585 }, 586 { 587 "hn_id": "42158451", 588 "title": "Convolutional Differentiable Logic Gate Networks", 589 "points": 26, 590 "comments": 4, 591 "url": "https://news.ycombinator.com/item?id=42158451", 592 "created_at": "2024-11-16T19:10:54Z" 593 }, 594 { 595 "hn_id": "39967245", 596 "title": "Formal Aspects of Language Modeling", 597 "points": 4, 598 "comments": 0, 599 "url": "https://news.ycombinator.com/item?id=39967245", 600 "created_at": "2024-04-08T07:47:56Z" 601 }, 602 { 603 "hn_id": "42115169", 604 "title": "Convolutional Differentiable Logic Gate Networks", 605 "points": 3, 606 "comments": 0, 607 "url": "https://news.ycombinator.com/item?id=42115169", 608 "created_at": "2024-11-12T13:04:29Z" 609 }, 610 { 611 "hn_id": "34101211", 612 "title": "Will we run out of data?", 613 "points": 3, 614 "comments": 0, 615 "url": "https://news.ycombinator.com/item?id=34101211", 616 "created_at": "2022-12-23T01:17:13Z" 617 }, 618 { 619 "hn_id": "25056202", 620 "title": "Learning Autocompletion from Real-World Datasets", 621 "points": 3, 622 "comments": 0, 623 "url": "https://news.ycombinator.com/item?id=25056202", 624 "created_at": "2020-11-11T07:17:33Z" 625 }, 626 { 627 "hn_id": "40939773", 628 "title": "Formal Aspects of Language Modeling", 629 "points": 2, 630 "comments": 0, 631 "url": "https://news.ycombinator.com/item?id=40939773", 632 "created_at": "2024-07-11T19:30:45Z" 633 }, 634 { 635 "hn_id": "42258010", 636 "title": "Gradient Boosting Trees and LLMs for Tabular Data Few-Shot Learning", 637 "points": 2, 638 "comments": 0, 639 "url": "https://news.ycombinator.com/item?id=42258010", 640 "created_at": "2024-11-27T17:46:47Z" 641 }, 642 { 643 "hn_id": "36985212", 644 
"title": "Will we run out of data to train LLMs?", 645 "points": 2, 646 "comments": 0, 647 "url": "https://news.ycombinator.com/item?id=36985212", 648 "created_at": "2023-08-03T12:53:23Z" 649 }, 650 { 651 "hn_id": "40610622", 652 "title": "Will we run out of data? Limits of LLM scaling based on human-generated data", 653 "points": 1, 654 "comments": 1, 655 "url": "https://news.ycombinator.com/item?id=40610622", 656 "created_at": "2024-06-07T17:08:29Z" 657 } 658 ], 659 "top_points": 35, 660 "total_points": 81, 661 "total_comments": 6 662 } 663 }