ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (35705B)


      1 {
      2   "paper": {
      3     "title": "Tracking the Moving Target: A Framework for Continuous Evaluation of LLM Test Generation in Industry",
      4     "authors": [
      5       "Maider Azanza",
      6       "Beatriz Pérez Lamancha",
      7       "Eneko Pizarro"
      8     ],
      9     "year": 2025,
     10     "venue": "International Conference on Evaluation & Assessment in Software Engineering (EASE 2025)",
     11     "arxiv_id": "2504.18985",
     12     "doi": "10.1145/3756681.3756946"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "A replication package is provided at Zenodo (https://zenodo.org/records/15274212), referenced in Section 5.2: 'The full reference to the methods can be found in the replication package.'"
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The replication package at Zenodo contains the test methods and related materials. The paper also states the final optimized prompt is available in the replication package (Section 5.4)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Section 5.5 specifies the tooling infrastructure: JUnit 5, SonarQube/SonarCloud, JaCoCo, Maven for dependency management. 'All dependency management was handled through Maven, ensuring reproducible builds and consistent tool versions across evaluations.'"
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper describes the methodology (Section 5) but does not provide step-by-step reproduction instructions. The replication package is referenced but the paper itself contains no README-style commands or a 'Reproducing Results' section."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Table 5 reports only point estimates for all metrics (e.g., '39.14%' line coverage, '91.76%' total weighted assessment). No confidence intervals, error bars, or ± notation appear anywhere in the results."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper compares 6 model configurations across 11 metrics (Table 5) and draws comparative conclusions ('dramatic improvements,' 'most immediate practical outcome') without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Table 5 provides baseline context for understanding improvement magnitude — e.g., compilation errors from 31 to 0, line coverage from 39.14% to 98.00%, total weighted assessment from 32.44% to 91.76%. The reader can compute effect sizes from the provided numbers."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 5.2 justifies the sample: 'Due to the significant time requirements of manual expert evaluation needed for measuring subjective metrics, we limited our selection to seven functions. These were specifically chosen to cover a diverse range of typical industrial programming patterns and testing challenges.'"
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No standard deviation, variance, IQR, or spread measures are reported. Table 5 shows single aggregate values per model/metric with no indication of variability across the 7 test functions or across runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Multiple baselines are included: the initial GPT-4 evaluation (March 2024) serves as a temporal baseline, expert-written tests serve as a quality baseline (Expert-generated Test Coverage metric), and 6 model configurations are compared against each other in Table 5."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The December 2024 evaluation includes contemporary models: GPT-4o, o1-Preview, o1-Mini, and Claude 3.5 Sonnet. These were among the most capable models available at evaluation time."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "The paper presents a measurement framework (an evaluation instrument), not a multi-component system that can be ablated. The LLMs being evaluated are third-party black boxes whose components cannot be removed."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The framework uses 11 metrics across 3 categories: code quality metrics (compilation errors, static analysis issues, setup/teardown usage), white box metrics (line/branch/decision coverage, test isolation), and black box metrics (equivalence partitioning, boundary value analysis, parameterization, expert coverage)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Black box metrics (Section 4.3) require expert judgment: 'Unlike code quality and white box metrics that can be automatically computed, these metrics require expert judgment to evaluate test design decisions and methodology.' Expert QA engineers evaluated equivalence partitioning, boundary value analysis, parameterization, and expert-generated test coverage."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The same 7 functions used for iterative prompt refinement (Section 5.3-5.4) are used for final evaluation. There is no separation between development and test sets — prompt engineering was tuned on the same examples used to report results."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 5 provides per-metric breakdowns across all 11 metrics and 6 model configurations, with separate sections for code quality, white box, and black box metrics."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses early failures: 'In March 2024, GitHub Copilot generated tests that largely failed to compile and required significant developer intervention to be usable.' Section 5.4 details specific issues like compilation errors, static analysis warnings, and low coverage that prompted prompt refinement."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The March 2024 results (32.44% total score, 31 compilation errors) are effectively negative results. The o1-Mini model performed poorly (63.81% total, 28.57% on coverage metrics) compared to peers, and this is reported without being hidden."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims the framework 'integrates with industry-standard tools like SonarQube' (supported by Sections 4-5), 'provides metrics that evaluate both technical adequacy and practical considerations' (supported by Tables 1-3), and 'results highlight the rapid evolution of LLM capabilities' (supported by Table 5's longitudinal comparison)."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "Section 7 claims 'proper prompt engineering significantly impacts performance,' and the March→May comparison implies prompt refinement caused improvement. However, Section 8 acknowledges: 'The observed improvements in LLM performance over time could be influenced by factors other than actual evolution in LLM capabilities. In particular, our iterative refinement of prompting strategies might confound the results.' The confound is acknowledged but not resolved."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'in Industry' (general) and the abstract offers 'practical guidance for companies seeking to integrate these technologies' (plural, general). However, results are from 7 Java functions at a single company (LKS Next). While Section 8 acknowledges the single-organization limitation, the title and abstract frame the results more broadly than the evidence supports."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 8 (Internal Validity) discusses a specific alternative explanation: 'The observed improvements in LLM performance over time could be influenced by factors other than actual evolution in LLM capabilities. In particular, our iterative refinement of prompting strategies might confound the results.' They describe mitigation measures including consistent base prompts and documented modifications."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly acknowledges the gap between metrics and quality: 'Coverage vs. effectiveness gap: Achieving high coverage does not necessarily guarantee meaningful tests' (Section 2.3). The dual objective/subjective metric design (Section 4) is itself a response to the proxy-outcome gap, recognizing that automated metrics alone don't capture test quality."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models are referenced by marketing names only: 'GPT-4,' 'GPT-o,' 'o1-Preview,' 'o1-Mini,' 'Claude 3.5 Sonnet.' No specific version strings (e.g., 'gpt-4-0613'), snapshot dates, or API versions are provided. Model behavior changes across versions, making these designations insufficient for reproduction."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 5.4 includes substantial quoted prompt fragments (e.g., 'When creating the test class, do not generate any line of code that could create any Issue, nor Bug nor Code Smell'). The paper states 'The final optimized prompt (available in our replication package)' at the Zenodo URL, providing the full prompt text."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No LLM hyperparameters are reported — no temperature, top-p, max tokens, or other API settings. The paper uses commercial LLM tools (GitHub Copilot, ChatGPT) without stating their sampling parameters, which significantly affect output."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "The paper evaluates GitHub Copilot and ChatGPT as third-party black-box tools. Per the schema, papers evaluating third-party tools as black boxes are NA — the authors cannot be expected to describe internal scaffolding they have no access to."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 5.2 documents the test case selection process in detail, explaining criteria (avoiding data leakage, representing common industrial patterns, compatibility with unit and integration testing) and listing each of the 7 functions with specific rationale for inclusion."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 8 'Threats to Validity' provides substantive discussion organized across four dimensions: construct validity, internal validity, external validity, and reliability, each with specific threats and mitigation strategies."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 8 discusses threats specific to this study: 'our iterative refinement of prompting strategies might confound the results' (internal validity), 'primarily based on experiences from a single organization and specific types of software projects' (external validity), 'expert assessment component inherently involves subjective judgment' (reliability)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper states generic limitations ('may not generalize to organizations with different development practices, project types, or quality requirements') but does not state specific exclusions — e.g., it doesn't say 'we did not test languages other than Java,' 'we did not evaluate system-level testing,' or 'our results do not apply to non-enterprise codebases.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "A replication package is available at Zenodo (https://zenodo.org/records/15274212), referenced in Section 5.2. The paper states the methods and prompt are available there for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5 describes the full evaluation workflow: environment setup (5.1), test case selection with specific criteria (5.2), iterative evaluation process (5.3), prompt engineering methodology (5.4), and tooling infrastructure (5.5). The study period (March 2024 to December 2024) is specified."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "Expert evaluators are identified only as 'expert Quality Assurance engineers from LKS Next' and 'a QA team leader.' The paper does not describe how these experts were selected, their qualifications, experience level, or whether the selection could introduce bias in subjective metric evaluation."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Figure 1 shows the BPMN diagram of the iterative measurement process, and Section 5 documents each stage: environment preparation → test case selection and ground truth → iterative evaluation (generate → analyze → compare → refine/document). The pipeline from generation to final scoring is traceable."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The Acknowledgments section lists specific funding: MCIN/AEI/10.13039/501100011033 contract, EU NextGeneration EU/PRTR under PID2021-125438OB-I00, University of the Basque Country 'University-Enterprise-Society' program (US24/10), and grant GIU21/037."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Maider Azanza and Eneko Pizarro from University of the Basque Country (UPV/EHU), Beatriz Pérez Lamancha from LKS Next. The dual academic-industry affiliation is transparent."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funding comes from government/EU grants (MCIN/AEI, EU NextGeneration) and university programs. These funders have no financial stake in whether GitHub Copilot or other LLMs perform well at test generation."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or conflicts-of-interest statement is provided. One author is employed by LKS Next, which is the company evaluating tools for potential adoption — this creates a potential interest in finding the tools viable, but no declaration addresses this."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the evaluated models (GPT-4, GPT-4o, o1-Preview, o1-Mini, Claude 3.5 Sonnet). The paper discusses contamination risk in general but does not specify when each model's training data was collected."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 5.2 explicitly addresses this: 'Our initial explorations confirmed previous findings that LLMs trained on public repositories often reproduce existing tests rather than generating novel ones, especially when encountering familiar code. To ensure our evaluation measured true generative ability, we deliberately selected functions and classes from projects that did not have pre-existing, publicly accessible test suites.'"
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "The contamination risk is mitigated by design: the evaluation uses private industrial code from LKS Next that would not be in LLM training data. Section 5.2 describes selecting 'functions and classes from projects that did not have pre-existing, publicly accessible test suites.'"
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants were studied. The expert evaluators are part of the research methodology (assessors), not study subjects."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants were studied. The paper evaluates LLM-generated tests using automated tools and expert assessment, not human subjects."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants were studied. Expert evaluators are part of the research team, not study participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants were studied."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants were studied. This is an evaluation of LLM tools, not an experimental study with human subjects."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants were studied."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants were studied."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "Section 7 mentions 'higher computational costs and latency' for newer models and 'per-seat licensing' for GitHub Copilot, but no actual cost figures are reported — no API costs, tokens consumed, or cost per test suite generated."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No computational budget is stated. The paper does not report GPU hours, API spend, evaluation wall-clock time, or hardware used for the evaluation."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "LLM outputs are non-deterministic, but the paper reports no analysis of variability across runs. Table 5 shows single values per model/metric with no indication of run-to-run variance."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of runs per model configuration is not stated. Section 8 mentions 'validate significant changes in performance through multiple test runs' as a mitigation strategy but provides no specific numbers."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The prompt engineering process is described as iterative (Section 5.4) but no search budget is reported — the number of prompt iterations tried, total configurations explored, or compute spent on prompt refinement is not stated."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The iterative prompt refinement process (Section 5.3-5.4) describes refining until 'quality thresholds are met,' but doesn't report all configurations tried or how the final prompt was selected over alternatives."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper compares 6 model configurations across 11 metrics (66 implicit comparisons) without any statistical tests, let alone corrections for multiple comparisons."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors designed the evaluation framework and then used it to evaluate LLMs. They do not acknowledge the inherent bias of evaluating tools using their own framework — the metric weights and selection naturally favor the dimensions they chose to measure."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Section 7 acknowledges 'newer models like Claude 3.5 Sonnet and o1-Preview show impressive performance' but 'come with higher computational costs and latency.' However, no quantitative performance-per-compute comparison is provided."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Sections 2.2-2.3 discuss construct validity: 'Coverage vs. effectiveness gap: Achieving high coverage does not necessarily guarantee meaningful tests.' The framework explicitly addresses this by including both automated metrics and expert assessment, questioning whether coverage alone measures test quality."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper is unclear about whether all models were evaluated through the same interface (GitHub Copilot) or some via direct ChatGPT access. Table 5 headers ('ChatGPT-4' vs model names) suggest mixed interfaces. The scaffold/interface confound is not discussed."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Temporal leakage is mitigated by design. Section 5.2: 'we deliberately selected functions and classes from projects that did not have pre-existing, publicly accessible test suites,' meaning the test targets would not appear in any model's training data regardless of cutoff date."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The prompt provides extensive context: full source code, all interconnected classes, JavaDoc, Maven dependencies, framework documentation links, and Mockito usage examples (Section 5.4). No discussion of whether this rich context represents realistic usage or gives the LLM information beyond what a typical developer would provide."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The 7 test functions appear to come from different projects and domains (rental management, user management, prime checker, etc.), but the paper does not explicitly discuss independence between test cases or whether shared company conventions could create dependencies."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "Section 5.2 reports that leakage was observed in practice: 'Our initial explorations confirmed previous findings that LLMs trained on public repositories often reproduce existing tests rather than generating novel ones.' As a concrete prevention method, the authors then used private industrial code without public test suites."
    359       }
    360     },
    361     "survey_methodology": {
    362       "prisma_or_structured_protocol": {
    363         "applies": false,
    364         "answer": false,
    365         "justification": "This is not a survey or systematic review paper. It is an industrial case study evaluating LLM test generators."
    366       },
    367       "quality_assessment_of_sources": {
    368         "applies": false,
    369         "answer": false,
    370         "justification": "This is not a survey or systematic review paper."
    371       },
    372       "publication_bias_discussed": {
    373         "applies": false,
    374         "answer": false,
    375         "justification": "This is not a survey or systematic review paper."
    376       }
    377     }
    378   },
    379   "scan_version": 3,
    380   "active_modules": [
    381     "experimental_rigor",
    382     "data_leakage"
    383   ],
    384   "claims": [
    385     {
    386       "claim": "LLM test generation capabilities improved dramatically from March 2024 to December 2024, with weighted assessment rising from 32.44% to over 91%.",
    387       "evidence": "Table 5 shows progression: ChatGPT-4 (March 2024) scored 32.44%, iterative GPT-4 (May 2024) 67.96%, o1-Preview (December 2024) 91.76%, Claude 3.5 Sonnet 90.72%.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Iterative prompt engineering significantly improves test generation quality within the same model.",
    392       "evidence": "Table 5: GPT-4 improved from 32.44% (first time) to 67.96% (iterative) using the same model with refined prompts. Section 5.4 details the prompt refinement process.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "o1-Preview and Claude 3.5 Sonnet achieve near-perfect compilation with 0 errors and line coverage above 95%.",
    397       "evidence": "Table 5: o1-Preview has 0 compilation errors and 98.00% line coverage; Claude 3.5 Sonnet has 0 compilation errors and 95.71% line coverage.",
    398       "supported": "weak"
    399     },
    400     {
    401       "claim": "Test parameterization improved from 12.70% to 88-91% across the study period.",
    402       "evidence": "Table 5 shows parameterization metric going from 12.70% (March 2024 GPT-4) to 91.84% (o1-Preview) and 88.89% (Claude 3.5 Sonnet).",
    403       "supported": "moderate"
    404     },
    405     {
    406       "claim": "English prompts produce significantly better results compared to Spanish prompts.",
    407       "evidence": "Section 5.4 states 'Our analysis revealed significantly better results with English prompts compared to Spanish' but provides no quantitative comparison data.",
    408       "supported": "unsupported"
    409     },
    410     {
    411       "claim": "Expert oversight remains essential despite high automated scores from newer models.",
    412       "evidence": "Section 7 discusses this qualitatively, noting black box metrics show LLMs 'don't yet match human expertise in test design decisions.' The gap between automated and expert test coverage metrics provides some evidence.",
    413       "supported": "moderate"
    414     }
    415   ],
    416   "methodology_tags": [
    417     "case-study",
    418     "benchmark-eval"
    419   ],
    420   "key_findings": "This paper presents a measurement framework for continuously evaluating LLM-based test generators in industrial settings, validated through a longitudinal study at LKS Next using 7 Java functions. Results show dramatic improvement from March to December 2024: GPT-4 initially scored 32.44% on a weighted assessment combining 11 code quality, coverage, and testing design metrics, while newer models (o1-Preview, Claude 3.5 Sonnet) achieved over 90%. The study demonstrates the importance of prompt engineering (same model improved from 32% to 68% through prompt refinement) and highlights that expert oversight remains necessary despite high automated metric scores.",
    421   "red_flags": [
    422     {
    423       "flag": "Tiny sample size",
    424       "detail": "Only 7 Java functions were evaluated. This is an extremely small sample for drawing conclusions about LLM test generation capability in industry. The paper acknowledges the limitation (resource constraints of manual expert evaluation) but the sample is too small for the generality of claims made."
    425     },
    426     {
    427       "flag": "Single organization study with broad claims",
    428       "detail": "All evaluation was conducted at LKS Next, a single mid-sized consultancy. The title frames this as 'in Industry' generally, and the abstract offers 'practical guidance for companies,' but the evidence base is one company with one technology stack (Java, Maven, SonarQube)."
    429     },
    430     {
    431       "flag": "Prompt engineering confounds temporal comparison",
    432       "detail": "The March→May GPT-4 improvement (32.44%→67.96%) conflates model improvement with prompt refinement. The December evaluation uses the refined prompt on newer models, making it impossible to isolate how much improvement is due to better models vs. better prompts. Section 8 acknowledges this but doesn't resolve it."
    433     },
    434     {
    435       "flag": "No statistical analysis on any comparison",
    436       "detail": "All results are point estimates from a single evaluation per model configuration across 7 functions. No error bars, confidence intervals, significance tests, or variance measures are reported, even though the paper makes comparative claims about model performance."
    437     },
    438     {
    439       "flag": "Company evaluating tools for own adoption",
    440       "detail": "One author (Beatriz Pérez Lamancha) is from LKS Next, the company evaluating LLM tools for potential adoption. This creates a subtle incentive bias — the company benefits from finding the tools viable. The paper reports increasingly positive results culminating in the company 'reconsidering' adoption."
    441     },
    442     {
    443       "flag": "No held-out evaluation",
    444       "detail": "The same 7 functions were used both for iterative prompt refinement and for final evaluation. There is no held-out test set, meaning the reported scores may be inflated by prompts tuned specifically for these examples."
    445     }
    446   ],
    447   "cited_papers": [
    448     {
    449       "title": "Software testing with large language models: Survey, landscape, and vision",
    450       "authors": ["J. Wang", "Y. Huang", "C. Chen", "Z. Liu", "S. Wang", "Q. Wang"],
    451       "year": 2024,
    452       "relevance": "Comprehensive survey of 102 papers on LLM-based software testing, identifying key challenges including coverage, test oracle problem, and real-world application."
    453     },
    454     {
    455       "title": "Using large language models to generate junit tests: An empirical study",
    456       "authors": ["M. Siddiq", "J. Da Silva", "R. Tanvir", "N. Ulfat", "F. Al Rifat", "V. Carvalho"],
    457       "year": 2024,
    458       "relevance": "Empirical evaluation of LLM unit test generation (Codex, GPT-3.5-Turbo, StarCoder) highlighting challenges with coverage, compilability, and reliability."
    459     },
    460     {
    461       "title": "ChatGPT vs SBST: A comparative assessment of unit test suite generation",
    462       "authors": ["Y. Tang", "Z. Liu", "Z. Zhou", "X. Luo"],
    463       "year": 2024,
    464       "doi": "10.1109/TSE.2024.3382365",
    465       "relevance": "Compares ChatGPT-generated tests against search-based software testing tools, finding SBST achieves higher coverage but LLM tests show better readability and assertion quality."
    466     },
    467     {
    468       "title": "ChatUniTest: A framework for LLM-based test generation",
    469       "authors": ["Y. Chen", "Z. Hu", "C. Zhi", "J. Han", "S. Deng", "J. Yin"],
    470       "year": 2024,
    471       "relevance": "Framework addressing LLM test generation limitations through adaptive focal context and generation-validation-repair mechanism."
    472     },
    473     {
    474       "title": "An empirical evaluation of using large language models for automated unit test generation",
    475       "authors": ["M. Schäfer", "S. Nadi", "A. Eghbali", "F. Tip"],
    476       "year": 2024,
    477       "doi": "10.1109/TSE.2023.3334955",
    478       "relevance": "TESTPILOT system for JavaScript test generation using LLMs, demonstrating importance of prompt engineering and iterative refinement for test quality."
    479     },
    480     {
    481       "title": "AutoTestGPT: A system for the automated generation of software test cases based on ChatGPT",
    482       "authors": ["H. Liu", "L. Liu", "C. Yue", "Y. Wang", "B. Deng"],
    483       "year": 2024,
    484       "relevance": "System using ChatGPT for automated test case generation, achieving 70% reduction in framework generation time vs. manual approaches."
    485     },
    486     {
    487       "title": "Large language models for software engineering: A systematic literature review",
    488       "authors": ["X. Hou", "Y. Zhao", "Y. Liu", "Z. Yang", "K. Wang", "L. Li", "X. Luo", "D. Lo", "J. Grundy", "H. Wang"],
    489       "year": 2024,
    490       "relevance": "Comprehensive systematic literature review of LLMs applied to software engineering tasks."
    491     },
    492     {
    493       "title": "The current challenges of software engineering in the era of large language models",
    494       "authors": ["C. Gao", "X. Hu", "S. Gao", "X. Xia", "Z. Jin"],
    495       "year": 2025,
    496       "relevance": "Examines current challenges that LLMs pose for software engineering practices and research."
    497     },
    498     {
    499       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    500       "authors": ["A. Sergeyuk", "Y. Golubev", "T. Bryksin", "I. Ahmed"],
    501       "year": 2025,
    502       "relevance": "Survey on practical use of AI coding assistants, finding test writing is one of the tasks developers most want to delegate to AI."
    503     },
    504     {
    505       "title": "On inter-dataset code duplication and data leakage in large language models",
    506       "authors": ["J. A. H. López", "B. Chen", "M. Saad", "T. Sharma", "D. Varró"],
    507       "year": 2025,
    508       "relevance": "Addresses data leakage and code duplication concerns when evaluating LLMs, directly relevant to benchmark contamination assessment."
    509     },
    510     {
    511       "title": "ConDefects: A complementary dataset to address the data leakage concern for LLM-based fault localization and program repair",
    512       "authors": ["Y. Wu", "Z. Li", "J. M. Zhang", "Y. Liu"],
    513       "year": 2024,
    514       "relevance": "Dataset designed to mitigate data leakage in LLM evaluation for fault localization and program repair."
    515     }
    516   ],
    517   "engagement_factors": {
    518     "practical_relevance": {
    519       "score": 3,
    520       "justification": "The measurement framework is directly applicable for companies evaluating LLM test generation tools in their development pipelines."
    521     },
    522     "surprise_contrarian": {
    523       "score": 1,
    524       "justification": "The finding that LLM test generation improved from 32% to 91% in 9 months is noteworthy but follows expected improvement trends rather than challenging conventional wisdom."
    525     },
    526     "fear_safety": {
    527       "score": 0,
    528       "justification": "No AI safety or security concerns are raised; the paper focuses on tool evaluation methodology."
    529     },
    530     "drama_conflict": {
    531       "score": 0,
    532       "justification": "No controversy or conflict; the paper presents a constructive framework without challenging any specific claims or organizations."
    533     },
    534     "demo_ability": {
    535       "score": 1,
    536       "justification": "A replication package is available at Zenodo, but the framework is a methodology/process rather than an installable tool."
    537     },
    538     "brand_recognition": {
    539       "score": 2,
    540       "justification": "The study evaluates GitHub Copilot (widely known), GPT-4, and Claude 3.5 Sonnet — recognizable products — though the authors and company (LKS Next) are not widely known."
    541     }
    542   }
    543 }

Impressum · Datenschutz