scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33789B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "EvoGPT: Leveraging LLM-Driven Seed Diversity to Improve Search-Based Test Suite Generation",
      6     "authors": [
      7       "Lior Broide",
      8       "Roni Stern",
      9       "Argaman Mordoch"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2505.12424",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract claims '10% improvement in both code coverage and mutation score.' Table III shows EvoGPT vs EvoSuite/TestART: LCCT 92 vs 83/83 (+9pp), BCCT 90 vs 79/80 (+10-11pp), MSCT 87 vs 69/78 (+9-18pp). The claim of ~10% average improvement is supported.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claims about component contributions are made via ablation studies (Table VI), which use controlled single-variable manipulation — each row adds one component. This is an adequate design for causal attribution in this context.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper explicitly bounds results to Defects4J and Java. Section IV-F (external validity) states: 'Our experiments are limited to Defects4J projects only. Although these span a range of domains, they may not fully represent the complexity of industrial code bases.' Future work mentions extending to 'additional programming languages.'",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section IV-F discusses specific alternative explanations: PITest may not detect equivalent mutants (falsely lowering mutation score), JaCoCo may miss paths due to bytecode instrumentation quirks, and LLM stochasticity means results may vary across runs.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "MSCT is explicitly described as 'a commonly used proxy for fault detection capability' (Section II). In construct validity, the authors acknowledge they 'did not measure other qualitative aspects such as readability, assertion relevance, or developer trust,' distinguishing automated proxy metrics from broader test quality.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section IV-E 'Limitations' discusses runtime, monetary cost, and evolutionary budget vs. wall-clock time. Section IV-F 'Threats to validity' covers internal, construct, and external validity.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats include: PITest may not detect equivalent mutants; JaCoCo may miss paths due to bytecode instrumentation; LLM stochasticity means results vary across runs; Defects4J-only evaluation; focal methods are public methods only; exact reproducibility depends on specific API configurations.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Section IV-F explicitly states: experiments limited to Defects4J, may not represent industrial codebases; focal methods are public methods which 'may not capture the full interaction behavior in production scenarios'; and reproducibility depends on LLM access. Future work lists extending to 'additional programming languages.'",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding, acknowledgments, or grant information is mentioned anywhere in the paper.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors list their affiliation as Ben-Gurion University of the Negev, Faculty of Computer and Information Science. No commercial product is being evaluated.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed; appears to be unfunded academic research with no commercial interest in the outcome.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial disclosure statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section II defines SBST, evolutionary algorithms, LLM-based test generation, and all three evaluation metrics (LCCT, BCCT, MSCT) with precise formulations.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three explicit contributions are bulleted at the end of the introduction: the hybrid system, the diversity-inducing configuration, and the empirical evaluation.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Sections II-C and II-D engage substantively with CodaMosa, TestART, ChatUniTest, pytLMtester, and SearchSYS, explaining precisely how EvoGPT differs (multi-configuration diversity vs single LLM invocation).",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Section IV-F (Reproducibility) states: 'Our code, data, and scripts are available at https://tinyurl.com/EvoGPT.'",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Defects4J is a publicly available benchmark. The authors additionally claim their data is available at their repository URL.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper mentions JaCoCo 0.8.12, PITest 1.19.0, and gpt-4o-mini, but does not provide a comprehensive environment specification (no requirements.txt, Dockerfile, Java version, or dependency list). Tool versions are scattered across the text rather than in a dedicated environment section.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The paper provides a URL to the code repository but does not include step-by-step reproduction instructions in the paper itself. No 'Reproducing Results' section or specific commands are documented.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results in Tables III, V, and VI are reported as point estimates without confidence intervals, error bars, or ± notation.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Table IV reports Wilcoxon signed-rank tests comparing EvoGPT against both baselines across all three metrics, with p-values (all < 0.001).",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Table IV reports Cliff's delta (δ) as an effect size measure for all comparisons, ranging from 0.75 to 0.98, with interpretation guidelines provided (|δ| ≥ 0.474 = large effect).",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The evaluation uses all 17 Defects4J projects, but no power analysis or justification is given for whether n=17 is sufficient for the statistical claims being made.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No standard deviations, variance, or spread measures across experimental runs are reported. The authors explicitly acknowledge in Section IV-F (construct validity) that 'future work could analyze variance across seeds to better characterize the stability of the generated test suites.'",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Two baselines are included: EvoSuite (EA-based SBST tool) and TestART (LLM-based test generation). Both are compared across all metrics and projects.",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "TestART (2024) is a recent LLM-based approach shown to outperform ChatUniTest. EvoSuite is the standard SBST baseline still widely used. The paper justifies excluding CodaMosa, pytLMtester (Python-specific), and SearchSYS (system-level) as inapplicable to their Java unit-test setting.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Table VI presents a comprehensive ablation study with 6 configurations: LLM-only, +EA, +Temperature diversity, +Prompt diversity, +Plateau recovery, and Full EvoGPT, measuring the incremental contribution of each component.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Three metrics are used: Line Coverage of Correct Tests (LCCT), Branch Coverage of Correct Tests (BCCT), and Mutation Score of Correct Tests (MSCT).",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "No human evaluation is conducted. The authors acknowledge in construct validity (Section IV-F) that they 'did not measure other qualitative aspects such as readability, assertion relevance, or developer trust.'",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "No separation between tuning and evaluation data. Fitness weights were chosen from 'preliminary experiments' (Section III-B1) and the population size was tuned (Table V), but results are reported on the same Defects4J projects without a held-out split.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Table III provides per-project breakdowns for all 17 Defects4J projects across all three metrics, not just aggregate averages.",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "No qualitative failure analysis is provided. There is no discussion of specific classes or methods where EvoGPT fails, no error analysis, and no examples of failure modes.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The ablation study (Table VI) shows that adding a standard EA to LLM-generated tests provides only a 'limited increase in performance' (+1.5pp LCCT, +1.3pp BCCT, +0.7pp MSCT). Table V shows diminishing returns beyond population size 25.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "The paper states 'gpt-4o-mini' without a snapshot date or API version. Section IV-F acknowledges: 'the LLM we used (gpt-4o-mini) is periodically updated by OpenAI,' indicating the model version is not pinned.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "Table I shows prompt objectives (summaries of instructions), not the full prompt text. The paper states 'The exact system prompts used for each LLM agent are included in the provided code' — the actual prompts are deferred to the code repository rather than provided in the paper.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Temperature values (0.3, 0.4, 0.5, 0.6, 0.8), population size (25), evolutionary budget (25 generations), crossover probability (0.8), stagnation threshold (τ=5), fitness improvement threshold (α=0.5), max plateau escapes (k=3), and mutation probability (1/Ntests) are all reported.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Section III describes the full pipeline in detail: source code preprocessing, 5 LLM agent configurations, generation-repair loop (4 max iterations), coverage enhancement with JaCoCo feedback, EA with ranked selection/crossover/mutation, plateau detection and LLM injection. Algorithm 1 provides pseudo-code. Figure 1 provides a system diagram.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section III-A states: 'we remove excessive or lengthy inline comments, documentation blocks, and unreachable code segments that could overload the model's context window.' Focal method extraction from public classes is also described.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The paper states 'Our code, data, and scripts are available at https://tinyurl.com/EvoGPT,' suggesting raw experimental data is accessible alongside the code.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Table II describes all 17 Defects4J projects with identifiers, version numbers, and focal method counts. The paper explains that focal methods are 'the public methods of each public class under test.' Defects4J is a well-documented standard benchmark.",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. Data source is the standard Defects4J benchmark.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "While the test generation pipeline is described, the data pipeline from generation to evaluation lacks specifics: how many tests were generated, how many were removed by the repair loop, how many passed compilation and execution. No filtering counts or attrition figures are reported.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "The paper uses gpt-4o-mini to generate tests for Defects4J Java projects but does not state the model's training data cutoff date. Since the LLM may have seen the source code during training, this is relevant.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "Defects4J projects (e.g., Commons-Lang, Gson, JFreeChart) are well-known open-source projects likely in gpt-4o-mini's training data. No discussion of whether the model has seen the code under test, which could inflate test quality metrics.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "Defects4J was published in 2014 and its constituent projects are widely available on GitHub. gpt-4o-mini has almost certainly been trained on this code. The paper does not address this contamination risk despite it potentially inflating all reported metrics.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in this study.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in this study.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in this study.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in this study.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in this study.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Table VII reports monetary cost per class: EvoGPT $0.32, TestART $0.01, EvoSuite $0.00. Average runtime per class is also reported: EvoGPT 8 min, TestART 2 min, EvoSuite 1 min.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "Table VII reports average runtime per class (8 minutes for EvoGPT). The evolutionary budget is stated as population=25, generations=25. Per-class API costs are provided. While total API spend across all experiments is not given, the per-class figures allow estimation.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No multi-seed analysis is reported. The authors explicitly acknowledge in construct validity: 'future work could analyze variance across seeds to better characterize the stability of the generated test suites.'",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "The paper does not state how many experimental runs produced the reported results. Results appear to be from single runs.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "Section III-B1 mentions 'In preliminary experiments, we evaluated several alternative weight configurations' for fitness weights, but does not report how many configurations were tried or the search method used.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "The fitness weight configuration was selected because it 'provided the most stable convergence behavior and best structural and mutation-based metrics,' but no details on how many alternatives were compared or on what data the selection was made.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": true,
    395           "answer": false,
    396           "justification": "Six Wilcoxon signed-rank tests are reported (3 metrics × 2 baselines) without any correction for multiple comparisons (e.g., Bonferroni, Holm).",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors compare EvoGPT against EvoSuite and TestART without acknowledging the bias of evaluating their own system. While EvoSuite is used as-is (default configuration), the comparison inherently favors the authors' tuned system.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "EvoGPT takes 8 min/class vs 1-2 min for baselines, and $0.32 vs $0.00-$0.01. While these differences are reported (Table VII), no performance-as-a-function-of-compute analysis is provided. The evolutionary budget is matched but total compute is not.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
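    414           "justification": "The paper only briefly acknowledges (Section IV-F) that Defects4J projects 'may not fully represent the complexity of industrial code bases'; it does not analyze whether Defects4J adequately represents real-world testing needs or whether LCCT/BCCT/MSCT on Defects4J translates to actual bug-finding capability in production code.",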
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": true,
    419           "answer": true,
    420           "justification": "EvoGPT and TestART both use gpt-4o-mini, holding the model constant while comparing scaffolding approaches. The comparison is explicitly about the scaffold/approach, not the underlying model. EvoSuite uses no LLM, serving as an SBST-only baseline.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "Defects4J projects have been publicly available since 2014. gpt-4o-mini was likely trained on data including these projects' source code and existing test suites. This temporal overlap is not discussed.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "The LLM receives source code as input to generate tests. If the model memorized existing tests for these well-known projects during training, it could reproduce rather than generate novel tests. This is not discussed.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of whether Defects4J code appears in gpt-4o-mini's training set, or whether the model's pre-existing knowledge of these open-source projects provides an unfair advantage.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination analysis.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "EvoGPT achieves on average ~10% improvement in code coverage and mutation score compared to both TestART and EvoSuite",
    455       "evidence": "Table III: EvoGPT LCCT=92, BCCT=90, MSCT=87 vs TestART (83/80/78) and EvoSuite (83/79/69). Table IV: all Wilcoxon tests p<0.001, Cliff's δ≥0.75.",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "Explicitly enforcing diversity through multiple prompt strategies and temperature settings is key to LLM-SBST hybridization effectiveness",
    460       "evidence": "Table VI ablation: +EA alone adds only +1.5pp LCCT over LLM-only; adding temperature diversity adds 1.3pp more; prompt diversity another 0.6pp; combined full system achieves 8.6pp total gain.",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "Naive integration of LLM-generated tests with evolutionary algorithms provides only minimal improvement over LLM-only baseline",
    465       "evidence": "Table VI: +EA configuration improves LCCT from 83.4% to 84.9% (+1.5pp) and MSCT from 80.1% to 80.8% (+0.7pp) — negligible without diversity.",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "The plateau escape mechanism substantially improves performance by enabling the EA to escape local optima",
    470       "evidence": "Table VI: +Plateau recovery improves LCCT from 86.8% to 90.0% (+3.2pp) and MSCT from 82.0% to 85.0% (+3.0pp) — the largest single-component gain in the ablation.",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "EvoGPT's different prompt-temperature configurations produce semantically distinct test suites",
    475       "evidence": "Intra-configuration Jaccard similarity = 0.526 vs inter-configuration = 0.476; lower inter-config similarity confirms distinct behavioral coverage, though the absolute difference is modest.",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "EvoGPT remains practically viable at $0.32/class and 8 min/class while delivering meaningfully better tests",
    480       "evidence": "Table VII reports cost and runtime; authors argue this is 'within a manageable range for many software engineering use cases' including nightly builds.",
    481       "supported": "moderate"
    482     }
    483   ],
    484   "methodology_tags": [
    485     "benchmark-eval"
    486   ],
    487   "key_findings": "EvoGPT, a hybrid LLM-evolutionary algorithm system, achieves statistically significant improvements (p<0.001, Cliff's δ≥0.75) over both LLM-only (TestART) and EA-only (EvoSuite) baselines across all 17 Defects4J Java projects and three test quality metrics. A structured ablation reveals that naive LLM+EA integration provides only marginal benefit (+1.5pp LCCT), while structured diversity via multiple prompt strategies and temperature settings combined with a diversity-aware plateau escape mechanism accounts for the majority of gains (8.6pp total LCCT improvement). The system costs 32× more per class than TestART ($0.32 vs $0.01) and stochastic variance across runs is acknowledged but not quantified, leaving reproducibility and stability as open questions.",
    488   "red_flags": [
    489     {
    490       "flag": "No variance across runs",
    491       "detail": "Results appear to come from single runs (the paper does not state the number of runs); no standard deviation across random seeds is reported despite stochastic LLM sampling. The construct validity section acknowledges this gap but does not address it."
    492     },
    493     {
    494       "flag": "Model version not pinned",
    495       "detail": "gpt-4o-mini is used without a snapshot date; the paper itself notes OpenAI periodically updates this model, making exact reproduction impossible."
    496     },
    497     {
    498       "flag": "Compute budget confound",
    499       "detail": "EvoGPT costs 32× more than TestART per class. The ablation does not fully control for the number of LLM API calls across configurations, so gains from structured diversity vs. sheer call volume are not cleanly separated."
    500     },
    501     {
    502       "flag": "Benchmark contamination not addressed",
    503       "detail": "Defects4J is a well-known public Java benchmark likely present in gpt-4o-mini's training data. If the model has memorized test patterns for these classes, results may overstate generalizability to unseen code."
    504     },
    505     {
    506       "flag": "Prompts not in paper",
    507       "detail": "Exact system prompts are not reproduced in the paper — only qualitative descriptions in Table I. Reproducibility requires access to the external code repository."
    508     }
    509   ],
    510   "cited_papers": [
    511     {
    512       "title": "CodaMosa: Escaping Coverage Plateaus in Test Generation with Pre-Trained Large Language Models",
    513       "relevance": "Direct inspiration for EvoGPT's plateau-escape mechanism; shows LLMs can improve EA coverage when search stagnates"
    514     },
    515     {
    516       "title": "TestART: Improving LLM-Based Unit Testing via Co-Evolution of Automated Generation and Repair Iteration",
    517       "relevance": "Primary LLM-based baseline and source of the generation-repair loop design adapted in EvoGPT"
    518     },
    519     {
    520       "title": "Whole Test Suite Generation (EvoSuite)",
    521       "relevance": "Primary SBST baseline and standard evolutionary test generation tool whose operators EvoGPT adapts"
    522     },
    523     {
    524       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
    525       "relevance": "Shows LLM test generation limitations vs SBST in structural coverage and motivates hybrid approaches"
    526     },
    527     {
    528       "title": "Test Wars: A Comparative Study of SBST, Symbolic Execution, and LLM-Based Approaches to Unit Test Generation",
    529       "relevance": "Direct comparative study of test generation paradigms used to contextualize EvoGPT's positioning"
    530     },
    531     {
    532       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    533       "relevance": "The benchmark used for all experimental evaluation in this paper"
    534     },
    535     {
    536       "title": "Optimizing Search-Based Unit Test Generation with Large Language Models: An Empirical Study",
    537       "relevance": "Prior work identifying when during SBST (initialization, post-stagnation) LLMs provide most benefit"
    538     },
    539     {
    540       "title": "Evolutionary Computation in the Era of Large Language Models: Survey and Roadmap",
    541       "relevance": "Taxonomizes LLM-EA hybrid approaches including LLM-enhanced EAs, the category EvoGPT falls under"
    542     }
    543   ],
    544   "engagement_factors": {
    545     "practical_relevance": {
    546       "score": 2,
    547       "justification": "Code is released and the approach could be integrated into Java test generation workflows, though the $0.32/class cost and 8-minute runtime limit immediate adoption."
    548     },
    549     "surprise_contrarian": {
    550       "score": 1,
    551       "justification": "The finding that naive LLM+EA integration provides limited benefit while diversity is key is somewhat surprising, but the overall result that hybrid approaches outperform individual ones confirms expectations."
    552     },
    553     "fear_safety": {
    554       "score": 0,
    555       "justification": "Test generation tool with no safety or security implications."
    556     },
    557     "drama_conflict": {
    558       "score": 0,
    559       "justification": "No controversy; straightforward benchmark comparison with incremental improvements."
    560     },
    561     "demo_ability": {
    562       "score": 2,
    563       "justification": "Code and scripts released at a public URL; requires Java setup and OpenAI API key to run."
    564     },
    565     "brand_recognition": {
    566       "score": 0,
    567       "justification": "Academic group from Ben-Gurion University; not a well-known AI lab or company."
    568     }
    569   },
    570   "hn_data": {
    571     "threads": [
    572       {
    573         "hn_id": "44554865",
    574         "title": "Emergent Misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    575         "points": 181,
    576         "comments": 48,
    577         "url": "https://news.ycombinator.com/item?id=44554865"
    578       },
    579       {
    580         "hn_id": "23872019",
    581         "title": "What changed in OpenSSL after heartbleed",
    582         "points": 158,
    583         "comments": 64,
    584         "url": "https://news.ycombinator.com/item?id=23872019"
    585       },
    586       {
    587         "hn_id": "42807387",
    588         "title": "A Faster Quantum Fourier Transform",
    589         "points": 89,
    590         "comments": 6,
    591         "url": "https://news.ycombinator.com/item?id=42807387"
    592       },
    593       {
    594         "hn_id": "32977887",
    595         "title": "Katara: Synthesizing CRDTs with Verified Lifting",
    596         "points": 86,
    597         "comments": 20,
    598         "url": "https://news.ycombinator.com/item?id=32977887"
    599       },
    600       {
    601         "hn_id": "45947534",
    602         "title": "Near-Perfect Broadband Quantum Memory Enabled by Spin-Wave Compaction",
    603         "points": 2,
    604         "comments": 1,
    605         "url": "https://news.ycombinator.com/item?id=45947534"
    606       },
    607       {
    608         "hn_id": "43408602",
    609         "title": "EXAONE Deep: Reasoning Enhanced Language Models",
    610         "points": 2,
    611         "comments": 0,
    612         "url": "https://news.ycombinator.com/item?id=43408602"
    613       },
    614       {
    615         "hn_id": "44672638",
    616         "title": "Promptomatix: An Automatic Prompt Optimization Framework for LLMs",
    617         "points": 1,
    618         "comments": 0,
    619         "url": "https://news.ycombinator.com/item?id=44672638"
    620       },
    621       {
    622         "hn_id": "43729080",
    623         "title": "The Most Expensive Part of an LLM Should Be Its Training Data",
    624         "points": 1,
    625         "comments": 0,
    626         "url": "https://news.ycombinator.com/item?id=43729080"
    627       },
    628       {
    629         "hn_id": "28490088",
    630         "title": "Leaky Front Ends: Security Vulnerabilities in Processor Front Ends",
    631         "points": 1,
    632         "comments": 0,
    633         "url": "https://news.ycombinator.com/item?id=28490088"
    634       }
    635     ],
    636     "top_points": 181,
    637     "total_points": 521,
    638     "total_comments": 139
    639   }
    640 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs