scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30832B)
      1 {
      2   "paper": {
      3     "title": "PRIMG: Efficient LLM-driven Test Generation Using Mutant Prioritization",
      4     "authors": [
      5       "Mohamed Salah Bouafif",
      6       "Mohammad Hamdaqa",
      7       "Edward Zulkoski"
      8     ],
      9     "year": 2025,
     10     "venue": "EASE 2025 (29th International Conference on Evaluation and Assessment in Software Engineering)",
     11     "arxiv_id": "2505.05584",
     12     "doi": "10.1145/3756681.3756991"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "PRIMG combines ML-based mutant prioritization with LLM-driven test generation for Solidity smart contracts. The refining module dramatically improves test correctness from 2–5% (single-shot) to 28–40% (5 iterations) depending on the project, with no significant further gain from 5 to 10 iterations. Prioritized mutant selection kills significantly more mutants than random selection across 3 real-world projects from Code4Arena. However, the authors acknowledge that the impact on the overall mutation score was not evaluated, which undermines the abstract's central claim of maintaining high mutation coverage.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repository provided in Section 6: 'All the scripts and results that have been used during our experiments can be found in this github repository https://github.com/Salah-SH/llm-mutation-testing.' Also mentioned in Section 4.1.3."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Section 4.1.3 states 'All scripts and dataset can be found here' with link to GitHub repository. The projects are from publicly accessible Code4Arena."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions 'EC2 instance of type g5.xlarge with 26 GiB of GPU memory' (Section 4.1) and tools (SuMo, Hardhat, Ganache), but no requirements.txt, Dockerfile, or detailed library version specifications are provided."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are described in the paper. The GitHub repository is referenced but the paper itself does not include a reproduction guide."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper reports Z-test statistics and p-values but does not provide confidence intervals or error bars on the main results (Tables 2 and 3)."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Pairwise Z-tests for proportions are used throughout: RQ1.1 compares single-shot vs refining (e.g., Allobase Z=-19.08, p<0.05), RQ1.2 compares loop sizes, and RQ2 compares prioritized vs random (e.g., Allobase Z=-15.98, p<0.05)."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Raw proportions with baseline context are reported throughout. E.g., Table 2: Allobase single-shot 23/500 (4.6%) vs refining loop-5 201/500 (40.2%). Table 3 shows raw mutant kill counts for prioritized vs random sets, providing enough context to assess effect magnitude."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification for why 500 tests per project, why 50 tests per random set in Table 3, or why 3 projects. No power analysis."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Table 3 shows 3 random test sets with varying results (e.g., Allobase: 200, 158, 8) but no standard deviation, IQR, or formal variance measure is reported. The spread across random sets is visible but not quantified."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Two baselines are used: (1) single-shot LLM generation (no refining) vs refining module in RQ1, and (2) random mutant selection vs prioritized selection in RQ2."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The only baselines are single-shot prompting and random mutant selection. No comparison with other LLM-based test generation approaches (e.g., ChatTester, AID which are cited in related work) or other prioritization techniques beyond random. The paper itself acknowledges random selection is the status quo, but contemporary LLM-based alternatives exist."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The two-component system is effectively ablated: RQ1 tests the generation module with/without refining (loop sizes 1, 5, 10), and RQ2 tests prioritized vs random selection, isolating each component's contribution."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Two distinct metrics are used: (1) proportion of correct tests generated (syntactic + behavioral correctness) in RQ1, and (2) number of mutants killed in RQ2."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Evaluation is entirely automated: compilation check (syntax verification) and test execution against the PUT (behavior verification). No human evaluation of test quality, readability, or meaningfulness."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "For the ML model, Section 3.2.3 describes 'Project-Only: training on mutants in all but one held-out smart contract in a single project and testing on mutants in the held-out smart contract.' The evaluation uses this leave-one-contract-out scheme."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down per project (Allobase, Particle, Quadrata) in Tables 2 and 3, with separate Z-test results for each. Table 1 provides project characteristics."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 5.1 discusses why LLMs fail to generate correct tests even after multiple iterations: hallucination, token-by-token generation inconsistencies, limited context windows, and struggles with complex multi-line fixes."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Finding 2 reports that increasing from 5 to 10 refining loops shows no significant improvement (diminishing returns). The paper also acknowledges the LLM still fails on many tasks regardless of iteration count."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The abstract claims PRIMG 'significantly reduces test suite size while maintaining high mutation coverage.' However, Section 7 explicitly admits: 'this study did not evaluate the number of newly killed mutants or the impact of the new tests on the overall mutation score.' The central claim about maintaining mutation coverage is unsupported by the paper's own evidence."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper's causal claims ('refining module significantly improves correctness,' 'prioritization outperforms random') are supported by controlled comparisons: with/without refining (single-shot vs iterative), and prioritized vs random selection, both with Z-test confirmation. The ablation-like design adequately supports these causal claims."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title 'Efficient LLM-driven Test Generation Using Mutant Prioritization' is unbounded — no mention of Solidity. The conclusion claims 'its design and evaluation methodology are fundamentally adaptable to various programming languages and models' despite testing only on 3 Solidity projects with a single LLM (Llama 3.1 8B)."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "Section 7 discusses validity threats but not alternative explanations for the observed results. No consideration of whether prioritized mutants are simply easier to kill, whether the LLM has seen Code4Arena projects in training, or whether the refining improvement is due to random restarts vs actual error-driven refinement."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures test correctness (compilation + passing PUT) and mutants killed, and frames these directly as test generation effectiveness. There is no significant gap between what is measured and what is claimed at the measurement level, though the abstract overclaims about mutation coverage."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section 4.1.1 states 'We use llama 3.1 with 8B parameter.' Llama 3.1 8B identifies the model family, version, and size. For the ML model: 'ridge regression' with specific configurations. However, the Llama variant (base vs instruct) is not specified."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Section 3.1.1 describes prompt components (PUT, mutant code, initial test file) and Section 3.1.2 describes augmented prompts conceptually, but the actual prompt text is never provided. Only the structure is described: 'CONCAT(initTest, prioritizedMutant, PUT).'"
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "ML model hyperparameters are reported (ridge regression, random forest with max depth 3 and 10 trees). However, LLM inference parameters (temperature, top-p, max tokens, sampling strategy) are completely absent, which significantly affects generation behavior."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The workflow is described in detail: Algorithm 1 (PRIMG main flow), Algorithm 2 (RefiningTest with syntax/behavior verification loop), Figure 1 (system overview), and the prioritization module (Section 3.2). The retry logic, feedback mechanism (error logs fed back as augmented prompts), and loop termination conditions are specified."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "Project selection criteria are stated (>10 smart contracts, initial test suite, mutation score <80%) but no pipeline counts are given — how many Code4Arena projects were considered initially, how many passed each criterion. The mutant generation process is described but intermediate counts (total mutants, killed vs survived) are not reported."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 7 'Threat to Validity' provides substantive discussion across multiple paragraphs covering prompt design, mutation operators, testing framework, missing metrics, and resource constraints."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7 discusses threats specific to this study: (1) SuMo mutation operators may not cover all smart contract vulnerabilities, (2) exclusive use of Hardhat may limit applicability, (3) did not evaluate mutation score impact, (4) limited set of prompt techniques explored, (5) resource constraints restricted scope."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "While Section 7 mentions framework-specific and operator-specific limitations, the conclusion actively extends beyond scope: 'its design and evaluation methodology are fundamentally adaptable to various programming languages and models.' The paper does not explicitly state what the results do NOT show beyond generic validity threats."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 6 states 'All the scripts and results that have been used during our experiments can be found in this github repository https://github.com/Salah-SH/llm-mutation-testing.'"
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 4.1.3 describes using Code4Arena as the source, the selection criteria for projects (number of contracts, initial test suite, mutation score), and Table 1 provides project characteristics. SuMo is described as the mutant generator with 25 Solidity-specific and 19 general-purpose operators."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data comes from Code4Arena, a public platform for smart contract auditing. Project selection criteria are described in Section 4.1.3."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "The overall flow is described conceptually (project selection → mutation testing → ML labeling → test generation) but counts at intermediate stages are missing. Total mutants generated per project, killed vs survived breakdown, and feature extraction statistics are not reported."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding or acknowledgments section is present in the paper. No mention of grants, sponsors, or funding agencies."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: two authors from Polytechnique Montréal, one (Edward Zulkoski) from Quantstamp, Inc., a smart contract security auditing company."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, so independence cannot be assessed. Notably, one author is from Quantstamp Inc., a company that performs smart contract security audits and would directly benefit from better automated testing tools. This conflict is not acknowledged."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement. Edward Zulkoski's affiliation with Quantstamp (a commercial smart contract auditing company) represents a potential financial interest that is not declared beyond the affiliation line."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper uses Llama 3.1 8B to generate test cases for publicly available Code4Arena projects but does not state the model's training data cutoff date. The LLM may have seen these projects during training."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether Code4Arena projects appeared in Llama 3.1's training data. Since Code4Arena projects are publicly available on GitHub, this is a real contamination risk."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "Code4Arena projects and their test suites are publicly available online. Llama 3.1 was trained on web data that likely includes GitHub repositories. This contamination risk is not addressed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. The evaluation is entirely automated."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. The study evaluates automated test generation on smart contract code."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost, tokens consumed, or wall-clock time reported. The approach calls Llama 3.1 up to 10 times per mutant across 500 mutants per project, but the cost of this is not quantified."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Section 4.1 mentions 'EC2 instance of type g5.xlarge with 26 GiB of GPU memory' but does not quantify total GPU hours, experiment duration, or overall compute budget."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No seed sensitivity analysis. The LLM generation is inherently stochastic but no analysis of variability across seeds or random initializations is provided. Table 3 shows variation across random mutant selections but not across repeated runs of the same configuration."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "The number of test generation attempts is stated: 500 per project (Section 4.2.2). Loop sizes of 1, 5, and 10 are tested (Section 4.2.1). Table 3 uses 3 random sets of 50 tests and 1 prioritized set."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Section 3.2.3 describes the ML model search: 'We evaluated every combination of the following choices' — 2 models (ridge regression, random forest) × 1 feature set × 2 training settings. The search space and selection method are documented, though compute cost is not."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Section 3.2.3 states: 'Building on findings in [15], we choose the following configuration: Model=Ridge regression, Features=All, Training Setting=Project-Only.' Configuration selected based on prior work and own evaluation across combinations."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Multiple Z-tests are performed across 3 projects and multiple comparisons (1 vs 5, 1 vs 10, 5 vs 10 trials; prioritized vs random). No Bonferroni or other multiple comparison correction is applied."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors implement and evaluate their own PRIMG system against a random baseline they also implement. No acknowledgment of self-comparison bias or independent evaluation."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The loop size analysis (1, 5, 10 iterations) implicitly relates compute to performance, but no explicit compute-vs-performance analysis is provided. The compute cost of prioritized vs random selection is not compared."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "No discussion of whether Code4Arena projects are representative of smart contracts generally, whether mutation score is a valid proxy for test quality, or whether the 3 selected projects are sufficient to support general claims."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "The prioritized vs random comparison in RQ2 uses the same test generation scaffold (same LLM, same refining process), controlling for the scaffold confound. The difference is solely in mutant selection strategy."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether Code4Arena projects existed before Llama 3.1's training cutoff. These projects are publicly available and could have been in the training data."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the prompts (which include the full PUT and mutant code) provide information that leaks the correct answer. No analysis of whether the LLM is generating tests from memorization vs understanding."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of independence between the 3 projects or whether patterns learned from one Code4Arena project transfer to others due to structural similarity."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination analysis."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "The refining module significantly improves correctness of LLM-generated tests compared to single-shot generation.",
    369       "evidence": "Table 2 shows Allobase: 4.6% (single-shot) → 40.2% (5 iterations); Particle: 2% → 30.2%; Quadrata: 4.2% → 28%. Z-tests confirm significance (e.g., Allobase Z=-19.08, p<0.05). Section 4.2.1.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "No significant improvement in test correctness when increasing refining loops from 5 to 10 iterations.",
    374       "evidence": "Z-tests show no significant difference between 5 and 10 trials across all projects: Allobase Z=-0.22, Particle Z=0.00, Quadrata Z=-0.35, all p>0.05. Section 4.2.1, Table 2.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Prioritized mutant selection significantly outperforms random selection in killing mutants.",
    379       "evidence": "Table 3 shows prioritized tests kill more mutants than random sets across all projects. Z-tests: Allobase Z=-15.98, Particle Z=-12.94, Quadrata Z=-14.12, all p<0.05. Section 4.2.2.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "PRIMG significantly reduces test suite size while maintaining high mutation coverage.",
    384       "evidence": "Abstract claim. Section 7 explicitly admits: 'this study did not evaluate the number of newly killed mutants or the impact of the new tests on the overall mutation score.' The mutation coverage claim is unsupported.",
    385       "supported": "weak"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "Tiny sample size",
    391       "detail": "Only 3 Solidity projects from Code4Arena are evaluated. This is insufficient to support the generalization claims made in the title and conclusion."
    392     },
    393     {
    394       "flag": "Missing key metric",
    395       "detail": "Section 7 explicitly admits the paper 'did not evaluate the number of newly killed mutants or the impact of the new tests on the overall mutation score,' despite mutation coverage being a central claim in the abstract."
    396     },
    397     {
    398       "flag": "Abstract overclaims",
    399       "detail": "The abstract claims 'maintaining high mutation coverage' but the paper does not measure mutation score improvement. This is a direct contradiction between the abstract and the evaluation."
    400     },
    401     {
    402       "flag": "No comparison with contemporary approaches",
    403       "detail": "Despite citing ChatTester and AID in related work, PRIMG is compared only against its own single-shot variant and random mutant selection. There is no head-to-head comparison with other LLM-based test generation tools."
    404     },
    405     {
    406       "flag": "Potential conflict of interest",
    407       "detail": "Co-author Edward Zulkoski is from Quantstamp Inc., a commercial smart contract security auditing company that would directly benefit from better automated testing tools. No COI disclosure or funding statement is present."
    408     },
    409     {
    410       "flag": "Incomplete manuscript",
    411       "detail": "Section 4.1.1 contains an incomplete sentence: 'Llama3.1 is the leading' — the sentence is cut off, suggesting the manuscript was not fully proofread."
    412     },
    413     {
    414       "flag": "Contamination risk unaddressed",
    415       "detail": "Code4Arena projects are publicly available on GitHub. Llama 3.1 was trained on web-scale data likely including these repositories. The LLM may have memorized test patterns for these specific contracts, inflating correctness rates."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "Effective test generation using pre-trained large language models and mutation testing",
    421       "authors": ["Arghavan Moradi Dakhel", "Amin Nikanjam", "Vahid Majdinasab", "Foutse Khomh", "Michel C Desmarais"],
    422       "year": 2024,
    423       "relevance": "Directly related: uses mutation testing to guide LLMs in test generation, the primary inspiration for PRIMG."
    424     },
    425     {
    426       "title": "No more manual tests? evaluating and improving chatgpt for unit test generation",
    427       "authors": ["Zhiqiang Yuan", "Yiling Lou", "Mingwei Liu"],
    428       "year": 2023,
    429       "arxiv_id": "2305.04207",
    430       "relevance": "ChatTester: iteratively refines LLM-generated test cases, directly comparable approach to PRIMG's refining module."
    431     },
    432     {
    433       "title": "LLM-Powered Test Case Generation for Detecting Tricky Bugs",
    434       "authors": ["Kaibo Liu", "Yiyang Liu", "Zhenpeng Chen"],
    435       "year": 2024,
    436       "arxiv_id": "2404.10304",
    437       "relevance": "AID framework combining LLMs with differential testing for fault-revealing test generation."
    438     },
    439     {
    440       "title": "VALTEST: Automated Validation of Language Model Generated Test Cases",
    441       "authors": ["Hamed Taherkhani", "Hadi Hemmati"],
    442       "year": 2024,
    443       "arxiv_id": "2411.08254",
    444       "relevance": "Addresses accuracy issues in LLM-generated tests, directly relevant to test validation challenges."
    445     },
    446     {
    447       "title": "Large-scale, Independent and Comprehensive study of the power of LLMs for test case generation",
    448       "authors": ["Wendkûuni C Ouédraogo", "Kader Kaboré", "Haoye Tian"],
    449       "year": 2024,
    450       "arxiv_id": "2407.00225",
    451       "relevance": "Large-scale evaluation of LLM capabilities for test generation across multiple settings."
    452     },
    453     {
    454       "title": "Evaluating large language models trained on code",
    455       "authors": ["Mark Chen", "Jerry Tworek"],
    456       "year": 2021,
    457       "arxiv_id": "2107.03374",
    458       "relevance": "Codex paper — foundational work on evaluating LLMs for code generation."
    459     },
    460     {
    461       "title": "Large language models for software engineering: A systematic literature review",
    462       "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"],
    463       "year": 2023,
    464       "relevance": "Comprehensive survey of LLMs for SE tasks including test generation."
    465     },
    466     {
    467       "title": "What's Wrong with Your Code Generated by Large Language Models? An Extensive Study",
    468       "authors": ["Shihan Dou", "Haoxiang Jia", "Shenxi Wu"],
    469       "year": 2024,
    470       "arxiv_id": "2407.06153",
    471       "relevance": "Studies quality issues in LLM-generated code including functional bugs and hallucinations."
    472     },
    473     {
    474       "title": "LLMorpheus: Mutation Testing using Large Language Models",
    475       "authors": ["Frank Tip", "Jonathan Bell", "Max Schäfer"],
    476       "year": 2024,
    477       "arxiv_id": "2404.09952",
    478       "relevance": "Uses LLMs for mutation generation itself, complementary to PRIMG's use of LLMs for test generation."
    479     },
    480     {
    481       "title": "A deep dive into large language models for automated bug localization and repair",
    482       "authors": ["Soneya Binta Hossain", "Nan Jiang", "Qiang Zhou"],
    483       "year": 2024,
    484       "relevance": "Studies LLM capabilities and limitations for automated bug detection and repair in software."
    485     },
    486     {
    487       "title": "Leveraging Propagated Infection to Crossfire Mutants",
    488       "authors": ["Hang Du", "Vijay Krishna Palepu", "James A Jones"],
    489       "year": 2024,
    490       "arxiv_id": "2411.09846",
    491       "relevance": "Advances mutation testing methodology by leveraging propagated infection for mutant detection."
    492     }
    493   ],
    494   "engagement_factors": {
    495     "practical_relevance": {
    496       "score": 2,
    497       "justification": "Solidity smart contract developers could use PRIMG's approach for automated test generation, though the tool is domain-specific and the LLM choice (Llama 3.1 8B) is modest."
    498     },
    499     "surprise_contrarian": {
    500       "score": 0,
    501       "justification": "Results confirm expectations: iterative refinement helps LLM output, and informed selection outperforms random selection."
    502     },
    503     "fear_safety": {
    504       "score": 1,
    505       "justification": "Smart contract testing is security-relevant, but this paper improves testing rather than demonstrating attacks or novel vulnerabilities."
    506     },
    507     "drama_conflict": {
    508       "score": 0,
    509       "justification": "No controversy or challenges to established beliefs."
    510     },
    511     "demo_ability": {
    512       "score": 2,
    513       "justification": "GitHub repository with scripts and datasets is provided, making the tool somewhat reproducible for those with the infrastructure."
    514     },
    515     "brand_recognition": {
    516       "score": 0,
    517       "justification": "Academic group from Polytechnique Montréal and small company Quantstamp; not a major AI lab."
    518     }
    519   }
    520 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs