ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29589B)


      1 {
      2   "paper": {
      3     "title": "Evaluating Diverse Large Language Models for Automatic and General Bug Reproduction",
      4     "authors": [
      5       "Sungmin Kang",
      6       "Juyeon Yoon",
      7       "Nargiz Askarbekkyzy",
      8       "Shin Yoo"
      9     ],
     10     "year": 2023,
     11     "venue": "IEEE Transactions on Software Engineering",
     12     "arxiv_id": "2311.04532",
     13     "doi": "10.1109/TSE.2024.3450837"
     14   },
     15   "scan_version": 2,
     16   "active_modules": [
     17     "experimental_rigor",
     18     "data_leakage"
     19   ],
     20   "methodology_tags": [
     21     "benchmark-eval"
     22   ],
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The authors provide both the tool (https://github.com/coinse/libro) and the replication package for the journal extension (https://github.com/coinse/libro-journal-artifact), mentioned in Sections 1 and 4.3."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Defects4J is a public benchmark. The GHRB dataset is described and the experimental data and analysis scripts are publicly available (Section 1). The replication package includes experimental data."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 4.3 specifies: Ubuntu 18.04.6 LTS with 32GB RAM and Intel i7-7700 CPU for test execution; Ubuntu 20.04.6 LTS with 16 Xeon Gold 5222 CPUs and 4 NVIDIA RTX 3090 GPUs (96GB VRAM) for LLM inference. Python 3.9 and the javalang library are also stated."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "A dedicated replication package is provided at https://github.com/coinse/libro-journal-artifact with experimental data and analysis scripts. The tool repository is also public."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Figure 3 reports 50%, 80%, and 95% intervals from 1000-run simulations relating the number of generation attempts to performance. Table 4 reports the 5th percentile, median, and 95th percentile for the two-example n=10 setting, sampled from the n=50 results."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No statistical significance tests are used. Comparisons between LLMs (Figure 6) and settings (Table 4) are based on raw number comparisons without p-values, t-tests, or any formal hypothesis testing."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper reports relative performance ratios (StarCoder at 70% of Codex on Defects4J, 90% on GHRB), absolute counts (251/750 = 33.5%), and percentage comparisons consistently throughout Section 6."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification is provided for why 750 Defects4J bugs or 31 GHRB bugs are sufficient sample sizes. No power analysis is discussed. The GHRB dataset is particularly small at 31 bugs."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "Main LLM comparison results (Figure 6, Table 5) are single-run point estimates without variance measures. While Figure 3 and Table 4 show intervals for subsampling simulations, the core comparative results across models lack standard deviations or spread measures."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Section 6.1.2 compares LIBRO against EvoCrash (state-of-the-art crash reproduction) and a Copy&Paste baseline that extracts code snippets from bug reports (Figure 2)."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "EvoCrash was the state-of-the-art crash reproduction technique at the time. The authors note there are no existing general bug reproduction techniques, making EvoCrash the most relevant comparison."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Table 4 presents systematic ablation of prompt components: no example, one example, two examples, within-project examples, constructor info, stack traces. Temperature ablation is in Section 6.4.5."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Multiple metrics are used: number of bugs reproduced, FIB count, ROC-AUC for selection (Figure 4), acc@n and precision@n for ranking (Table 7), wef@n for wasted effort, and GPU memory usage (Figure 7)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "All evaluation is automated (fail on buggy version, pass on fixed version). No human evaluation of test quality, readability, or developer usefulness is performed, despite the paper's claims about reducing developer effort."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The GHRB dataset (Section 4.1) was specifically collected from PRs after the Codex training data cutoff, serving as a held-out evaluation set to mitigate data leakage concerns."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Table 5 provides per-project breakdown of reproduction performance on Defects4J. Table 8 provides per-project breakdown for GHRB. Figure 6 shows per-LLM breakdown."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 7.1 presents a detailed failure case (Checkstyle Issue #11365, Listing 5) where LIBRO fails because the test references a non-existent file, illustrating the limitation of not being able to modify execution environments."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Several negative results are reported: within-project examples hurt performance (Table 4, Section 6.1.1), fine-tuning on natural language hurts code tasks (StarCoderPlus and BloomZ, Section 6.4.1), the Closure project shows poor performance, and a ChatGPT behavior change broke the pipeline (Section 6.4.3)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims — one-third reproduction rate (251/750 = 33.5%, Table 5), StarCoder at 70% of Codex (Section 6.4.1), 90% on GHRB (Section 6.4.2), performance improving with size (Figure 8a) — are all supported by experimental results."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Causal claims like 'fine-tuning on natural language can hurt performance' are supported by controlled comparisons within the same model family (StarCoder vs StarCoderPlus, Bloom vs BloomZ), where training data/technique is the only difference. Ablation of prompt components (Table 4) also follows single-variable manipulation."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper bounds claims to Java (all benchmarks are Java), acknowledges Defects4J may be in training data (Section 4.1), creates GHRB to verify generalization, and notes project-specific variation (Table 5). Title uses 'Evaluating' rather than claiming universal capability."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Section 7.2 analyzes whether LIBRO's performance is due to code extraction from reports vs genuine synthesis. Section 4.1 and RQ3/RQ4-2 address whether results are due to LLM memorization vs actual capability, verified with GHRB and membership tests."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper directly measures bug reproduction (test fails on buggy version, passes on fixed version) and frames results as bug reproduction performance. The measurement matches the claim with no proxy gap."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Table 3 lists exact model identifiers: code-davinci-002, text-davinci-003, gpt-3.5-turbo-0301, gpt-3.5-turbo-0613, Bloom-176B, BloomZ-176B, Incoder-(1,6)B, CodeGen2-(1,3.7,7,16)B, StarCoder-15B, StarCoderBase-15B, StarCoderPlus-15B with sizes and release years."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Listing 1 shows the complete prompt format used, including the Markdown structure, reproduction instruction, and code block initiation. The full prompt text is provided, not just a description."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 4.3 states: temperature 0.7 (default), maximum generated tokens 256, n=10 or n=50 test samples. Temperature is further explored across 7 values in Section 6.4.5."
    162       },
    163       "scaffolding_described": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The LIBRO pipeline is described in detail: Section 3 provides four stages (prompt engineering, LLM querying, postprocessing, selection & ranking) with formal algorithms (Algorithm 1 for test postprocessing, Algorithm 2 for selection and ranking) and worked examples."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 4.1 documents filtering: Defects4J 814 → 750 bugs (58 excluded for poor report-bug mapping, 6 for structural changes). GHRB: 581 PRs → 435 (non-test-introducing removed) → 84 (non-merged or multi-issue removed) → 31 (verified BRT). Each step has explicit criteria."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No dedicated 'Limitations' or 'Threats to Validity' section exists. The paper's sections are: Introduction, Motivation, Approach, Evaluation, Research Questions, Results, Discussion, Related Work, Conclusion. Limitations are scattered across RQ discussions but not collected in a dedicated section."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Specific threats are discussed throughout: Defects4J likely in training data (Section 4.1), GHRB is small (31 bugs), Checkstyle failures due to external file dependency (Section 7.1), ChatGPT behavior changes breaking the pipeline (Section 6.4.3), and temperature sensitivity (Section 6.4.5). These are specific to this study."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The paper makes no explicit scope-boundary statements about what the results do NOT show. The evaluation is limited to Java and specific benchmarks, but this is implicit rather than explicitly stated as a boundary. The paper doesn't have a 'what this does not show' discussion."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Experimental data and analysis scripts are publicly available via the replication package (https://github.com/coinse/libro-journal-artifact), enabling independent verification of results."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 4.1 describes Defects4J v2.0 collection and filtering in detail. GHRB collection from 17 GitHub repositories is described step-by-step, including the criteria for PR selection and verification process."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. Data sources are standard benchmarks (Defects4J) and publicly available GitHub repositories (GHRB)."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Section 4.1 documents: Defects4J 814 bugs → 58 excluded (poor mapping) → 6 excluded (structural differences) → 750. GHRB: 581 PRs → 435 (test-introducing) → 84 (merged, single-issue) → 31 (verified BRT). Each stage has counts and criteria."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source or acknowledgments section is present in the paper text. There is no mention of grants, corporate sponsors, or funding agencies."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly disclosed: all authors are from KAIST (Korea Advanced Institute of Science and Technology). They are academic researchers not affiliated with any LLM company they evaluate."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The absence of any funding disclosure makes this unanswerable."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper references 'the Codex training data cutoff point' (Section 4.1) without stating the actual date. For other LLMs (StarCoder, CodeGen2, etc.), no training cutoff dates are explicitly stated."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Extensively discussed. Section 4.1 cites Lee et al. showing Defects4J BRTs are in StarCoder training data. The GHRB dataset was created specifically to mitigate this. StarCoder's dataset membership test was used to verify GHRB tests are not in the Stack dataset."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "The paper acknowledges Defects4J is likely in most LLM training data (Section 4.1) and creates the GHRB held-out dataset after the Codex training cutoff. They verify using StarCoder's dataset membership test that GHRB reproducing tests are not in the Stack dataset."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study. All evaluation is automated against benchmark programs."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants. The study evaluates LLMs on software benchmarks."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in the study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in the study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Table 6 reports per-step timing: API querying 5.85s, postprocessing 1.23s, test execution 4.00s, total 444s for 50 tests. Figure 7 plots GPU memory usage per model. Section 1 notes 'more than eight months of GPU time and seven months of CPU time.'"
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "The paper states the experiments required 'more than eight months of GPU time and seven months of CPU time' (Section 1). Section 4.3 specifies the hardware: 4 NVIDIA RTX 3090 GPUs with 96GB total VRAM."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Main LLM comparison results (Figure 6) are single-run results without reporting sensitivity to random seeds. The 1000-run subsampling simulation (Figure 3) shows variance from subsampling but not from different experimental seeds."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Section 4.3 explicitly states 'we sample 10 tests (denoted by n=10)' and 'experimenting with sampling 50 tests as well.' The 1000-run simulation count is also stated. Temperature experiments state the number of configurations."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "While 7 temperature values and multiple prompt configurations are explored as research questions (Table 4, Figure 8b), no formal hyperparameter search budget is reported. The search method and total compute for configuration selection are not described."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "Table 4 shows all prompt configurations and their results. Figure 8b shows all temperature values tested. The two-example n=50 setting is chosen based on best observed performance, and all alternatives are transparently reported."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The paper compares 15 LLMs, multiple prompt configurations, and 7 temperature settings without any statistical tests, let alone corrections for multiple comparisons."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors evaluate their own LIBRO system against baselines (including their own implementation of Copy&Paste) without acknowledging potential author-evaluation bias. No independent evaluation or mitigation strategy is discussed."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Figure 7 explicitly plots GPU memory usage against reproduction performance for all open-source LLMs, with a Pareto front analysis. Each model is mapped to a specific GPU count, helping practitioners make resource-informed decisions."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper does not discuss whether Defects4J or GHRB actually measure real-world bug reproduction capability. They define BRT precisely but do not question whether success on these benchmarks translates to practical developer benefit."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": true,
    344         "answer": true,
    345         "justification": "All LLMs are evaluated using the same LIBRO pipeline (prompt format, postprocessing, ranking), controlling for scaffold differences. The comparison is strictly between models within the same framework."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "Section 4.1 creates GHRB from PRs after the Codex training data cutoff. They verify using StarCoder's dataset membership test that GHRB tests are not in the Stack training dataset."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Section 3.1 notes 'our specific template format makes it highly unlikely that prompts we generate exist verbatim within the LLM training data' and discusses that bug reports are only connected to BRTs via chains of references, partly mitigating leakage."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": true,
    362         "justification": "The paper acknowledges Defects4J is likely in training data (citing Lee et al.), creates GHRB from separate repositories post-cutoff, and verifies using StarCoder's dataset membership test that GHRB test code is not in the Stack dataset."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": true,
    367         "justification": "Two concrete methods are used: (1) StarCoder's dataset membership test (Section 6.4.2) to verify GHRB tests are not in training data, and (2) temporal splits creating GHRB from post-cutoff PRs."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "LIBRO using code-davinci-002 can reproduce 33.5% of bugs (251/750) in Defects4J v2.0.",
    374       "evidence": "Table 5 and Section 6.1.1 show 251 bugs reproduced out of 750 under the two-example n=50 setting.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "StarCoder achieves 70% of Codex's reproduction performance on Defects4J and 90% on GHRB.",
    379       "evidence": "Section 6.4.1 reports StarCoder reproducing 125/173 bugs (n=10, Defects4J). Section 6.4.2 reports StarCoder at 90% of Codex on GHRB (n=50).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Bug reproduction performance increases as LLM size increases within the same model family.",
    384       "evidence": "Figure 8a shows increasing performance with model size for both CodeGen2 (1B→16B) and Incoder (1B→6B) families.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Fine-tuning code LLMs on natural language can hurt bug reproduction performance.",
    389       "evidence": "Section 6.4.1 shows StarCoderPlus (NL fine-tuned) performed 'substantially worse' than StarCoder, and BloomZ performed worse than Bloom.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Sampling temperature of 0.6 optimizes LIBRO-StarCoder performance.",
    394       "evidence": "Figure 8b and Section 6.4.5 show systematic evaluation across 7 temperature values (0 to 1.0), with T=0.6 yielding 127 reproduced bugs (best).",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "LIBRO's self-consistency-based selection and ranking strategy generalizes across different LLMs.",
    399       "evidence": "Table 10 and Figure 9 show the strategy consistently improves precision across all tested LLMs. The ROC-AUC is remarkably consistent (0.76-0.80) across temperatures (Section 6.5.2).",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Open-source LLMs can generate bug-reproducing tests for bugs not in their training data.",
    404       "evidence": "Section 6.4.2 shows StarCoder reproducing bugs from GHRB (post-cutoff dataset), and StarCoder's dataset membership test confirms the tests are not in the Stack training data.",
    405       "supported": "strong"
    406     },
    407     {
    408       "claim": "LIBRO substantially outperforms the crash reproduction baseline EvoCrash.",
    409       "evidence": "Figure 2 shows LIBRO reproduced 91 more unique bugs (19 being crash bugs) than EvoCrash, and can handle non-crash bugs that EvoCrash cannot.",
    410       "supported": "strong"
    411     }
    412   ],
    413   "key_findings": "LIBRO, an LLM-based pipeline for generating bug-reproducing tests from natural language bug reports, can reproduce about one-third (251/750) of Defects4J bugs using code-davinci-002. Among 15 evaluated LLMs, StarCoder (15B, open-source) achieves 70% of Codex's performance on Defects4J and 90% on a held-out dataset, demonstrating open-source LLM viability. The self-consistency-based selection and ranking mechanism generalizes across LLMs and temperature settings, with ROC-AUC of 0.76-0.80 for bug selection. Performance scales with LLM size within families, while fine-tuning on natural language hurts bug reproduction capability.",
    414   "red_flags": [
    415     {
    416       "flag": "No statistical significance tests",
    417       "detail": "All LLM comparisons (15 models, multiple settings) are based on raw number comparisons without any statistical tests. Claims of 'best performance' and 'substantially worse' lack formal hypothesis testing."
    418     },
    419     {
    420       "flag": "Very small held-out dataset",
    421       "detail": "The GHRB dataset used to verify generalization contains only 31 bugs, which is quite small for drawing conclusions about model capabilities outside training data."
    422     },
    423     {
    424       "flag": "No limitations section",
    425       "detail": "The paper lacks a dedicated Limitations or Threats to Validity section, which is unusual for an empirical SE paper in IEEE TSE. Limitations are scattered across research question discussions."
    426     },
    427     {
    428       "flag": "No human evaluation of developer utility",
    429       "detail": "Despite framing LIBRO as reducing developer effort and measuring metrics like 'wasted effort,' no human evaluation of test quality, readability, or actual developer usefulness was conducted."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "Evaluating large language models trained on code",
    435       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    436       "year": 2021,
    437       "arxiv_id": "2107.03374",
    438       "relevance": "Foundational paper on Codex/HumanEval benchmark for LLM code generation, directly relevant to evaluating LLM coding capabilities."
    439     },
    440     {
    441       "title": "Language models are few-shot learners",
    442       "authors": ["T. Brown", "B. Mann", "N. Ryder"],
    443       "year": 2020,
    444       "relevance": "GPT-3 paper establishing few-shot prompting paradigm used in LIBRO's prompt engineering approach."
    445     },
    446     {
    447       "title": "Starcoder: may the source be with you!",
    448       "authors": ["R. Li", "L. B. Allal", "Y. Zi"],
    449       "year": 2023,
    450       "relevance": "Open-source code LLM that achieved the best open-source performance in the LIBRO evaluation, relevant to open-source LLM capability assessment."
    451     },
    452     {
    453       "title": "Large language models are few-shot testers: Exploring llm-based general bug reproduction",
    454       "authors": ["S. Kang", "J. Yoon", "S. Yoo"],
    455       "year": 2023,
    456       "relevance": "Prior conference version of this work (ICSE '23), establishing LLM-based bug reproduction as a research area."
    457     },
    458     {
    459       "title": "CodaMOSA: Escaping coverage plateaus in test generation with pre-trained large language models",
    460       "authors": ["C. Lemieux", "J. P. Inala", "S. K. Lahiri", "S. Sen"],
    461       "year": 2023,
    462       "relevance": "Combines LLM generation with search-based testing, directly relevant to LLM-assisted test generation evaluation."
    463     },
    464     {
    465       "title": "Self-consistency improves chain of thought reasoning in language models",
    466       "authors": ["X. Wang", "J. Wei", "D. Schuurmans"],
    467       "year": 2023,
    468       "relevance": "Theoretical basis for LIBRO's selection and ranking algorithm; demonstrates self-consistency as a general LLM property."
    469     },
    470     {
    471       "title": "How is ChatGPT's behavior changing over time?",
    472       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    473       "year": 2023,
    474       "relevance": "Documents LLM behavior drift over time, directly relevant to reproducibility concerns with API-based LLM evaluation."
    475     },
    476     {
    477       "title": "The GitHub recent bugs dataset for evaluating LLM-based debugging applications",
    478       "authors": ["J. Y. Lee", "S. Kang", "J. Yoon", "S. Yoo"],
    479       "year": 2023,
    480       "relevance": "Companion work analyzing data contamination in Defects4J for LLM evaluation, directly relevant to benchmark validity."
    481     },
    482     {
    483       "title": "Defects4j: A database of existing faults to enable controlled testing studies for Java programs",
    484       "authors": ["R. Just", "D. Jalali", "M. D. Ernst"],
    485       "year": 2014,
    486       "relevance": "Primary benchmark used for evaluation; widely used in software engineering research for bug-related tasks."
    487     },
    488     {
    489       "title": "Emergent abilities of large language models",
    490       "authors": ["J. Wei", "Y. Tay", "R. Bommasani"],
    491       "year": 2022,
    492       "relevance": "Referenced for the sudden emergence of bug reproduction capability at certain model sizes (CodeGen2-7B), relevant to understanding LLM scaling behavior."
    493     },
    494     {
    495       "title": "Impact of code language models on automated program repair",
    496       "authors": ["N. Jiang", "K. Liu", "T. Lutellier", "L. Tan"],
    497       "year": 2023,
    498       "relevance": "Evaluates open-source LLMs for SE tasks, relevant to comparative LLM capability assessment."
    499     },
    500     {
    501       "title": "Competition-level code generation with AlphaCode",
    502       "authors": ["Y. Li", "D. Choi", "J. Chung"],
    503       "year": 2022,
    504       "arxiv_id": "2203.07814",
    505       "relevance": "Uses test generation to boost code synthesis, relevant to understanding the relationship between test generation and code quality."
    506     }
    507   ]
    508 }

Impressum · Datenschutz