scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32506B)
      1 {
      2   "paper": {
      3     "title": "Hallucination to Consensus: Multi-Agent LLMs for End-to-End JUnit Test Generation",
      4     "authors": [
      5       "Qinghua Xu",
      6       "Guancheng Wang",
      7       "Lionel Briand",
      8       "Kui Liu"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv",
     12     "arxiv_id": "2506.02943"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "CANDOR, a multi-agent LLM framework for JUnit test generation using Llama 3.1 70B and DeepSeek R1, achieves line/branch coverage comparable to EvoSuite while significantly outperforming it in mutation score (≥0.049 improvement) on HumanEvalJava and LeetCodeJava. Its panel-discussion oracle strategy outperforms the fine-tuned SOTA oracle generator TOGLL by ≥21.1 percentage points in oracle correctness on both correct and faulty code. Ablation studies confirm the Planner agent is critical for coverage and the panel discussion is critical for oracle accuracy, with removal causing 0.067–0.086 drops in oracle correctness.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "Section 4.4 states 'we plan to release the code publicly upon paper acceptance.' This is a promise of future release, not actual availability."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "HumanEvalJava is a publicly available benchmark [8]. LeetCodeJava is constructed from publicly available LeetCode solutions hosted on GitHub [21]. Both datasets are accessible."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Section 4.4 mentions hardware (Precision 7960 Tower, Intel Xeon w9-3495X, dual NVIDIA RTX 6000 Ada GPUs) and that the implementation uses Python with LangChain, but no requirements.txt, dependency versions, or environment specification file is provided."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. Code is not yet released, and the paper does not include a README or 'Reproducing Results' section."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Tables 1 and 2 and Figure 4 report only point estimates (averages over 3 runs). No confidence intervals, error bars, or ± notation are provided."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Wilcoxon Signed Rank tests are conducted for all comparisons in RQ1, RQ2, and RQ3, with significance level 0.05. Results are marked with '*' in tables."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Vargha and Delaney's A12 effect size is reported for EVO-CANDOR vs TOGLL comparisons (A12=0.920 on correct code, A12=0.960 on faulty code). Absolute differences are also reported throughout (e.g., '21.1 percentage points')."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No power analysis or statistical justification for dataset sizes. The choice of 100 LeetCode problems (50 medium + 50 hard) is explained by 'time and computational resource constraints' but not statistically justified."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Although experiments are repeated three times and averaged (Section 4.3), no standard deviations, IQR, or spread measures are reported in any table or figure. Only point estimates are shown."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Four baselines are included: EvoSuite (search-based SOTA for test prefixes), LLM-Empirical (prompt-based SOTA), TOGLL (fine-tuning-based SOTA for oracles), and EVO-CANDOR (variant for fair oracle comparison). Section 4.2 describes each."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "TOGLL (2024) and LLM-Empirical (2024) are recent. EvoSuite is from 2011 but the paper justifies its inclusion because it remains SOTA for test prefix coverage, citing Tang et al. (2024) who confirmed this."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "RQ3 (Section 5.3, Table 2) presents ablation studies: w/o Planner, w/o Requirement Engineer, w/o Panel discussion, and w/ Voting (majority voting variant). Each ablation isolates one component."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Four metrics are used: line coverage, branch coverage, mutation score (Section 4.3), and oracle correctness (Section 4.3). RQ1 evaluates the first three, RQ2 evaluates oracle correctness."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of generated test quality, readability, or usefulness is included. All evaluation is automated (coverage tools, PiTest, oracle correctness against ground truth)."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Hyperparameters such as number of pipelines (tested 1–5, selected 3) and EvoSuite timeout (tested 1–120 min, selected 2 min) appear to have been tuned on the same datasets used for final evaluation. No separate validation set is described."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by dataset (HumanEvalJava, Leetcode-Medium, Leetcode-Hard) in Tables 1, 2 and Figure 4. The appendix provides per-model breakdowns (Llama, CodeLlama, Mistral)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 6.1 discusses failure modes: LLMs producing Python syntax instead of Java, hallucinations where 'a method max_element() was mistakenly used to compute the minimum,' and the overthinking phenomenon of reasoning LLMs."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "EvoSuite achieves higher branch coverage than CANDOR on Leetcode-Medium by 0.01 (Table 1). Mistral 22B performs substantially worse (Appendix Table 3). The paper also notes that in 'over 70% of the cases, the Panelists' discussions contained clear disagreements.'"
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims of comparable coverage with EvoSuite are supported by Table 1 (non-significant differences). The '21.1 percentage points' oracle improvement claim is supported by Figure 4 (minimum gap of 0.211 on Leetcode-Medium faulty code). Ablation claims are supported by Table 2."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Causal claims are made via ablation studies (Section 5.3): 'Removing the Planner significantly reduces test prefix quality' and 'removing the panel discussion causes substantial drops in oracle correctness.' The ablation design uses controlled single-variable manipulation, which is adequate for these claims."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 4.1 explicitly states: 'we consider only Java methods that do not depend on user-defined classes or external libraries, thus excluding datasets such as Defects4J and SF110.' Section 6.2 acknowledges evaluation is on only two datasets and may not generalize to complex programs with dependencies."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 6.2 discusses alternative explanations: LLM choice may affect results, data leakage could inflate performance (mitigated by mutation score), and randomness in outputs. Section 5.1 discusses why CANDOR achieves higher mutation scores (semantic understanding vs structural coverage)."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Section 6.2 explicitly acknowledges: 'we use the mutation score as a proxy for bug-finding capability.' The paper clearly distinguishes between what is measured (coverage, mutation score, oracle correctness) and the broader goal (test quality). The proxy gap between mutation score and real bug detection is acknowledged."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section 4.4 specifies 'Llama 3.1 70B as the basic LLM and DeepSeek R1 Llama-distilled 70B as the reasoning LLM.' Alternatives tested include 'CodeLlama 70B and Mistral 22B.' For baselines, 'GPT-3.5-Turbo' and 'CodeParrot' are specified. These are specific model identifiers with sizes."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Figure 3 provides the full system prompts and user prompt templates for all 8 agents (Initializer, Planner, Tester, Inspector, Requirement Engineer, Panelist, Interpreter, Curator). The template variables (source_code, description, etc.) are filled from the public datasets, enabling prompt reconstruction."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Section 4.4 reports max_attempts=3, DeepSeek output limit of 2000 tokens, and 3 pipelines. EvoSuite assertion_timeout=2min and assertion_strategy='mutation' are reported. However, LLM temperature, top-p, and other sampling parameters are not reported."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The multi-agent scaffolding is described in detail in Section 3 with Figures 2 and 3. All agents (Initializer, Planner, Tester, Inspector, Requirement Engineer, Panelist, Interpreter, Curator) have defined roles, prompts, input/output specifications, and iterative feedback loops."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4.1 describes HumanEvalJava (160 programs from [8]) and LeetCodeJava construction (randomly sampled 50 medium + 50 hard from LeetCode, solutions from [21]). Section 4.3 describes mutant generation using PiTest with default mutation operators (3 mutants per SUT)."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 6.2 'Threats to validity' provides a structured discussion with four subsections: Construct Validity, Internal Validity, Conclusion Validity, and External Validity."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6.2 discusses specific threats: choice of Llama 3.1 70B/DeepSeek R1 may affect results, data leakage from pretraining data, LLM output randomness requiring 3 repetitions, and limitation to two datasets excluding complex programs with external dependencies."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 4.1 explicitly states: 'we consider only Java methods that do not depend on user-defined classes or external libraries, thus excluding datasets such as Defects4J and SF110. Addressing such dependencies requires additional techniques, such as retrieval-augmented generation and mocking, which we plan to address in the future.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No raw experimental data (generated test files, LLM outputs, per-method results) is made available. Only aggregate results are reported in tables and figures."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "HumanEvalJava is described as 160 Java programs with average 41 LOC and CC 4.90 (Section 4.1). LeetCodeJava construction is described: 50 medium + 50 hard problems randomly sampled, solutions from publicly maintained GitHub repository [21], with LOC and CC statistics reported."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data sources are standard benchmarks (HumanEvalJava) and a constructed dataset from public LeetCode solutions."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline from source code to final test file is documented across Section 3 (Initialization → Test Prefix Generation → Oracle Fixing). The evaluation pipeline is described in Section 4.3 (compilation, execution, Jacoco coverage, PiTest mutation, oracle correctness check)."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No explicit funding acknowledgment section is present in the paper. Authors are affiliated with Research Ireland Lero Centre and Huawei, but no grants or funding sources are listed."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All author affiliations are clearly stated: three authors at Research Ireland Lero Centre for Software and University of Limerick (one of whom is also affiliated with University of Ottawa), and Kui Liu at Software Engineering Application Technology Lab, Huawei."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "Cannot determine funder independence without explicit funding disclosure. One author is from Huawei, which has commercial interests in software testing tools, though Huawei products are not directly evaluated."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial disclosure statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No training data cutoff dates are stated for Llama 3.1 70B or DeepSeek R1. The paper uses these models on HumanEvalJava (published 2022) without establishing whether the benchmark predates training data collection."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Section 6.2 Internal Validity states: 'Another potential threat is data leakage, where our datasets may be included in the LLMs' pretraining data, leading to inflated performance. To mitigate this, we evaluate the mutation score, which detects behavioral changes in synthetically modified programs.'"
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Section 6.2 acknowledges contamination risk and argues mutation score provides a contamination-resistant metric since 'mutated versions are highly unlikely to appear in pretraining corpora.' However, no direct contamination detection is performed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. Evaluation is entirely automated on benchmark datasets."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference costs, API costs, tokens consumed, or wall-clock time per test generation are reported. Section 6.1 mentions DeepSeek can produce 10,000+ token outputs 'taking hours to complete a single test file' before truncation, but no systematic cost data is provided."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Hardware is described (Precision 7960 Tower, Intel Xeon w9-3495X, dual NVIDIA RTX 6000 Ada GPUs) but total GPU hours, wall-clock time for the full evaluation, or total compute budget are not reported."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Section 4.3 states 'we repeat all experiments three times,' but only averages are reported in Tables 1, 2 and Figure 4. No standard deviations, ranges, or per-run results are shown to assess sensitivity across runs."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Section 4.3 explicitly states: 'To reduce the influence of randomness, we repeat all experiments three times and calculate the average of each metric.'"
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Section 4.4 mentions experimenting with 1–5 pipelines and EvoSuite timeouts of 1–120 minutes, but does not report the total compute spent on this search or a systematic search methodology."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Section 4.4 justifies the selection of 3 pipelines ('improvements beyond 3 are not significant and incur much longer generation time') and 2-minute EvoSuite timeout ('coverage was not further improved after assertion_timeout > 2 min'). The Appendix reports results with alternative LLMs."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Multiple Wilcoxon Signed Rank tests are conducted across 3 datasets, 3+ baselines, and multiple metrics, but no correction for multiple comparisons (Bonferroni, Holm, etc.) is applied or mentioned."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors implement and evaluate their own system against their own re-implementations of baselines (LLM-Empirical with GPT-3.5-Turbo instead of original Codex). This potential bias is not acknowledged or discussed."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "CANDOR uses multiple LLM agents in iterative loops with 3 panelist pipelines per test case, which is dramatically more compute than LLM-Empirical's single-prompt approach or EvoSuite's search-based approach. This compute disparity is never discussed or controlled for."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "HumanEvalJava and LeetCodeJava contain standalone methods without external dependencies. The paper acknowledges this limitation but does not discuss whether these benchmarks validly measure real-world test generation capability, where code typically has complex dependencies."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "CANDOR (multi-agent, iterative, multiple LLM calls) is compared against LLM-Empirical (single-prompt) without discussing the scaffold confound. The ablation study isolates individual components within CANDOR, but the comparison with baselines does not control for scaffolding differences."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "HumanEvalJava was published in 2022, before Llama 3.1 (2024) training. The paper mentions general data leakage risk but does not specifically discuss the temporal overlap between benchmark publication and model training dates."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks information not available in real usage. The test generation task provides source code and natural language descriptions as expected in practice, but this is not explicitly analyzed."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether train and test examples share structural similarities. HumanEvalJava problems may appear in the LLMs' training data, and LeetCode solutions are widely available online, but independence is not verified."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. The paper only argues conceptually that mutation score is resistant to leakage since 'mutated versions are highly unlikely to appear in pretraining corpora.'"
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "CANDOR achieves comparable line and branch coverage to EvoSuite across all datasets.",
    369       "evidence": "Table 1 shows CANDOR achieves 0.991/0.990/0.989 line coverage vs EvoSuite's 0.961/0.959/0.984, and 0.950/0.949/0.980 branch coverage vs 0.942/0.959/0.976. Wilcoxon tests confirm differences are not statistically significant (p>0.05).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "CANDOR significantly outperforms EvoSuite in mutation score with improvements exceeding 0.049 on all datasets.",
    374       "evidence": "Table 1 shows CANDOR mutation scores of 0.980/0.939/0.937 vs EvoSuite's 0.858/0.845/0.888. All differences are statistically significant (p<1e-4).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "CANDOR outperforms TOGLL by at least 21.1 percentage points in oracle correctness on both correct and faulty source code.",
    379       "evidence": "Figure 4 shows EVO-CANDOR oracle correctness of 0.874/0.922/0.857 vs TOGLL's 0.610/0.646/0.602 on correct code, and 0.855/0.823/0.821 vs 0.581/0.612/0.567 on faulty code. Minimum gap is 0.211 (Leetcode-Medium faulty). Wilcoxon tests significant (p<1e-4), A12=0.920–0.960.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Removing the panel discussion causes substantial drops in oracle correctness (0.067–0.086).",
    384       "evidence": "Table 2 RQ3 ablation: w/o Panel oracle correctness drops to 0.824/0.855/0.827 from 0.910/0.930/0.894. All differences statistically significant (p<1e-4).",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Removing the Planner leads to substantial decreases in line coverage (≥0.050), branch coverage (≥0.046), and mutation score (≥0.070).",
    389       "evidence": "Table 2 RQ3 ablation: w/o Planner shows drops to 0.892/0.940/0.935 line coverage, 0.840/0.903/0.922 branch coverage, 0.869/0.869/0.833 mutation score. All differences significant (p<1e-4).",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Panel discussion with Curator reasoning outperforms simple majority voting for oracle generation.",
    394       "evidence": "Table 2: w/ Voting achieves 0.896/0.910/0.877 oracle correctness vs CANDOR's 0.910/0.930/0.894. Decreases of 0.014–0.020, all statistically significant (p<1e-2).",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No variance reported despite multiple runs",
    401       "detail": "All experiments are repeated 3 times and averaged, but no standard deviations, confidence intervals, or per-run results are reported in any table or figure. Readers cannot assess result stability."
    402     },
    403     {
    404       "flag": "Compute budget disparity not discussed",
    405       "detail": "CANDOR uses 8+ LLM agents in iterative loops with 3 parallel panelist pipelines per test case, consuming dramatically more compute than the single-prompt LLM-Empirical baseline. This makes the comparison unfair on a cost-performance basis, and no cost data is provided."
    406     },
    407     {
    408       "flag": "Baseline re-implementation may be weakened",
    409       "detail": "LLM-Empirical is re-implemented with GPT-3.5-Turbo instead of the original Codex (code-davinci-002), which is no longer available. The authors acknowledge the substitution but do not assess how much it affects the baseline's performance."
    410     },
    411     {
    412       "flag": "No multiple comparison correction",
    413       "detail": "Dozens of Wilcoxon tests are conducted across 3 datasets, multiple baselines, and multiple metrics without any family-wise error rate correction, inflating the risk of Type I errors."
    414     },
    415     {
    416       "flag": "Code not released",
    417       "detail": "Code release is promised 'upon paper acceptance' but is not currently available, preventing independent verification of results."
    418     },
    419     {
    420       "flag": "Missing LLM sampling parameters",
    421       "detail": "Temperature, top-p, and other sampling parameters for Llama 3.1 70B and DeepSeek R1 are not reported, despite these significantly affecting LLM output."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Evosuite: automatic test suite generation for object-oriented software",
    427       "authors": ["Gordon Fraser", "Andrea Arcuri"],
    428       "year": 2011,
    429       "relevance": "Foundational search-based test generation tool used as the primary baseline for test prefix quality comparison."
    430     },
    431     {
    432       "title": "Togll: Correct and strong test oracle generation with llms",
    433       "authors": ["Soneya Binta Hossain", "Matthew Dwyer"],
    434       "year": 2024,
    435       "arxiv_id": "2405.03786",
    436       "relevance": "State-of-the-art fine-tuning-based oracle generator for Java that CANDOR substantially outperforms."
    437     },
    438     {
    439       "title": "Using large language models to generate junit tests: An empirical study",
    440       "authors": ["Mohammed Latif Siddiq", "Joanna Cecilia Da Silva Santos"],
    441       "year": 2024,
    442       "relevance": "Representative SOTA prompt-engineering approach for LLM-based JUnit test generation, used as a baseline."
    443     },
    444     {
    445       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    446       "authors": ["Daya Guo"],
    447       "year": 2025,
    448       "arxiv_id": "2501.12948",
    449       "relevance": "Reasoning LLM used as the Panelist agent in CANDOR's panel discussion strategy for oracle fixing."
    450     },
    451     {
    452       "title": "Toga: A neural method for test oracle generation",
    453       "authors": ["Elizabeth Dinella", "Gabriel Ryan", "Todd Mytkowicz", "Shuvendu K Lahiri"],
    454       "year": 2022,
    455       "relevance": "First LLM-based test oracle generator using fine-tuned CodeBERT, pioneering the specification-based oracle generation approach."
    456     },
    457     {
    458       "title": "Chatunitest: A framework for llm-based test generation",
    459       "authors": ["Yinghao Chen"],
    460       "year": 2024,
    461       "relevance": "LLM-based test generation framework addressing similar problems of automated unit test creation."
    462     },
    463     {
    464       "title": "Codamosa: Escaping coverage plateaus in test generation with pre-trained large language models",
    465       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K Lahiri", "Siddhartha Sen"],
    466       "year": 2023,
    467       "relevance": "Hybrid approach combining search-based testing with LLMs to escape coverage plateaus, relevant to combining traditional and LLM-based testing."
    468     },
    469     {
    470       "title": "Chatgpt vs sbst: A comparative assessment of unit test suite generation",
    471       "authors": ["Yutian Tang", "Zhijie Liu", "Zhichao Zhou", "Xiapu Luo"],
    472       "year": 2024,
    473       "relevance": "Empirical comparison finding EvoSuite still outperforms LLM-based approaches in code coverage, motivating CANDOR's multi-agent approach."
    474     },
    475     {
    476       "title": "Unit test case generation with transformers and focal context",
    477       "authors": ["Michele Tufano", "Dawn Drain", "Alexey Svyatkovskiy"],
    478       "year": 2020,
    479       "relevance": "First LLM-based unit test generation approach (AthenaTest), pioneering the use of transformers for test generation."
    480     },
    481     {
    482       "title": "ChatAssert: LLM-Based Test Oracle Generation With External Tools Assistance",
    483       "authors": ["Ishrak Hayet", "Adam Scott", "Marcelo d'Amorim"],
    484       "year": 2025,
    485       "doi": "10.1109/TSE.2024.3519159",
    486       "relevance": "LLM-based oracle generation approach using external tools, directly relevant to specification-based oracle generation."
    487     },
    488     {
    489       "title": "Hits: High-coverage llm-based unit test generation via method slicing",
    490       "authors": ["Zejun Wang", "Kaibo Liu", "Ge Li", "Zhi Jin"],
    491       "year": 2024,
    492       "relevance": "LLM-based test generation using method slicing for high coverage, relevant to LLM-based test prefix generation strategies."
    493     },
    494     {
    495       "title": "Evaluating and Improving ChatGPT for Unit Test Generation",
    496       "authors": ["Zhiqiang Yuan", "Mingwei Liu"],
    497       "year": 2024,
    498       "doi": "10.1145/3660783",
    499       "relevance": "Evaluation of ChatGPT for unit test generation with improvement strategies, directly relevant to LLM-based testing evaluation."
    500     },
    501     {
    502       "title": "Large Language Models for Unit Testing: A Systematic Literature Review",
    503       "authors": ["Quanjun Zhang"],
    504       "year": 2025,
    505       "relevance": "Systematic review of LLM-based unit test generation covering the landscape this paper contributes to."
    506     },
    507     {
    508       "title": "CoverUp: Effective High Coverage Test Generation for Python",
    509       "authors": ["Juan Altmayer Pizzorno", "Emery D Berger"],
    510       "year": 2025,
    511       "relevance": "High-coverage LLM-based test generation for Python, demonstrating the generation-and-refinement paradigm used in CANDOR."
    512     }
    513   ],
    514   "engagement_factors": {
    515     "practical_relevance": {
    516       "score": 2,
    517       "justification": "A multi-agent framework for automated JUnit test generation is practically useful, but the code is not yet released and its scope is limited to standalone Java methods."
    518     },
    519     "surprise_contrarian": {
    520       "score": 1,
    521       "justification": "Showing that a prompt-engineering-based multi-agent approach can outperform fine-tuned models in oracle generation is moderately interesting but not deeply contrarian."
    522     },
    523     "fear_safety": {
    524       "score": 0,
    525       "justification": "No AI risk or security concerns raised; the paper is about improving test generation quality."
    526     },
    527     "drama_conflict": {
    528       "score": 0,
    529       "justification": "No controversy or conflict; straightforward technical contribution with fair comparison framing."
    530     },
    531     "demo_ability": {
    532       "score": 0,
    533       "justification": "Code is not released; stated as planned 'upon paper acceptance' with no demo available."
    534     },
    535     "brand_recognition": {
    536       "score": 1,
    537       "justification": "Uses DeepSeek R1, which has moderate recognition, and one author is from Huawei, but neither is a headline-grabbing brand in this context."
    538     }
    539   }
    540 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs