scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19104B)
      1 {
      2   "paper": {
      3     "title": "CoverUp: Effective High Coverage Test Generation for Python",
      4     "authors": ["Juan Altmayer Pizzorno", "Emery D. Berger"],
      5     "year": 2025,
      6     "venue": "Proc. ACM Softw. Eng. (FSE)",
      7     "doi": "10.1145/3729398"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "CoverUp is available at https://github.com/plasma-umass/coverup and archived on Zenodo. A replication package is at https://github.com/plasma-umass/coverup-eval (Section 7, Data Availability)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Benchmark suites are derived from open-source projects and the replication package is publicly available at https://github.com/plasma-umass/coverup-eval."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Section 4.1 specifies Docker images used for CodaMosa, Python versions (3.12.3 for CoverUp, 3.9.12 for MuTAP), and the local system is described. The replication package includes Docker-based setup."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "A replication package is provided with Zenodo archiving. The experimental setup in Section 4.1 details exact configurations, and the GitHub repos contain instructions."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No confidence intervals or error bars are reported. Results are point estimates (e.g., '80% vs. 47%') without uncertainty quantification."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Paired permutation tests are used: 'using paired permutation tests, we obtain a p-value of 2.0 x 10^-5 for both' (Section 4.2)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Absolute coverage differences are reported with baselines for context, e.g., 'per-module median line+branch coverage of 80% (vs. 47%)' and '89% (vs. 77%)'. This provides magnitude context."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Neither a justification for the number of benchmark modules nor a power analysis is provided. The CM suite draws on 35 projects and MT has its own set, but no rationale is given for these sizes."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Results appear to be single-run numbers. No standard deviation, IQR, or multi-run variance is reported for the main coverage results."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines are compared: CodaMosa (Codex and GPT-4o variants), MuTAP (Codex and GPT-4o, few-shot and zero-shot), and ablated CoverUp versions."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "CodaMosa (2023) and MuTAP are recent LLM-based test generators. The paper also updates baselines to use GPT-4o for fair comparison."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "RQ5 (Section 4.6) provides a thorough ablation study removing coverage information, code context, and error fixing components individually. Table 4 and Figure 17 show results."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple coverage metrics are reported: line coverage, branch coverage, and combined line+branch coverage, both overall and per-module median."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of test quality is included. All evaluation is automated via coverage metrics and test execution. Human judgment of test readability or maintainability would be relevant but is absent."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Not applicable — this is a test generation tool evaluated on coverage metrics, not a predictive model evaluated on held-out data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Per-module results are provided in box plots (Figures 11-14), and results are broken down across three benchmark suites (CM, PY, MT)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "State pollution issues are discussed extensively (Section 3.6), flaky test handling is described, and the ablation study shows where components fail. Section 3.4 discusses prompt confusion issues."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that the original CodaMosa few-shot prompt confuses GPT-4o (Section 4.3), and the ablations show that removing individual components lowers performance. It also reports that CoverUp uses 48% more tokens than CodaMosa (Section 4.5, RQ4)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of 80% vs. 47% (CodaMosa) and 89% vs. 77% (MuTAP) are supported by results in Section 4.2. The ablation claim is supported in Section 4.6."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about component contributions are supported by controlled ablation studies (RQ5) with single-variable manipulation. The ablation design is adequate."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The title specifies 'Python' and claims are bounded to the tested benchmarks. The Threats to Validity section acknowledges 'selecting a different set of benchmarks could produce different results' and 'LLM model dependency.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper considers that performance could stem from the LLM rather than the approach (RQ2 addresses this), and that CodaMosa's lower performance might be due to model differences (addressed by testing CodaMosa with GPT-4o)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The exact model version is specified: 'gpt-4o-2024-05-13' (Section 4.1). This is a specific snapshot identifier."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompt text is provided in the paper (Figures 3, 5, 6, 7, 8, 9) showing the actual system prompts and templates with concrete examples of filled-in values."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Temperature is set to zero, no output token limit, target code segment size of 50 lines (Section 4.1). Rate limiting parameters are also discussed (Section 3.7)."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The agentic scaffolding is described in detail: iterative prompting loop with coverage feedback, tool function (get_info), continued chat sessions, error fixing feedback, and checkpoint/resume logic (Sections 3.1-3.7)."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.1 documents benchmark suite construction: CM derived from CodaMosa's 35 open-source projects, PY from the Python standard library, MT from MuTAP's benchmark. Module filtering criteria are described."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5 'Threats to Validity' provides a dedicated discussion of limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats are discussed: benchmark selection bias, execution environment differences (missing Python modules), and LLM model dependency. These are specific to this study."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The threats section states that different benchmarks could produce different results, the approach depends on GPT-4o model capabilities, and execution environment differences affected CodaMosa."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Replication package at https://github.com/plasma-umass/coverup-eval and Zenodo archiving provide access to raw experimental data."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.1 describes how benchmark suites were assembled: CM from CodaMosa's Docker image with 35 open-source projects, PY from Python standard library modules, MT from MuTAP's benchmark."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data sources are standard benchmarks from prior work."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from source code to coverage measurement is documented: code segment extraction, prompt generation, test execution, coverage measurement, and iterative refinement (Sections 3.1-3.7)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding acknowledgment section found in the paper. One author has an Amazon Web Services affiliation but no funding disclosure."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: University of Massachusetts Amherst for both authors, with Berger additionally affiliated with Amazon Web Services."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed. Berger's AWS affiliation is noted but no conflict statement addresses it."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial disclosure statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The model used is gpt-4o-2024-05-13, but no training data cutoff date is stated. Since the benchmarks are from public open-source code, the model may have seen them during training."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether GPT-4o's training data included the open-source projects used as benchmarks. This is a significant concern since the benchmarks are public Python projects."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The benchmark code is from public open-source projects that GPT-4o likely saw during training. This contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Table 3 reports tokens, running time, and approximate US$ cost for CoverUp and CodaMosa on the CM suite (Section 4.5, RQ4)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Running time is reported (4 hours vs. 71 hours for CodaMosa), token counts are provided, and the local computing environment is described in Section 4.1."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CoverUp achieves per-module median line+branch coverage of 80% vs. CodaMosa's 47% on the CM benchmark suite.",
    286       "evidence": "Section 4.2, paired permutation test p-value of 2.0 x 10^-5. Results shown in Figures 11-12.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "CoverUp achieves overall line+branch coverage of 89% vs. MuTAP's 77% on the MT benchmark suite.",
    291       "evidence": "Section 4.2, Figure 14 shows coverage comparison on MT suite.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "CoverUp's performance stems from its components (coverage info, code context, error fixing), not just the LLM.",
    296       "evidence": "Section 4.6 (RQ5) ablation study showing all three components contribute; error fixing ablation loses 14-37% on coverage metrics. Section 4.3 (RQ2) shows CoverUp outperforms ablated LLM-only version.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "CoverUp runs roughly 18x faster than CodaMosa while achieving higher coverage.",
    301       "evidence": "Table 3, Section 4.5: 4 hours vs. 71 hours. However, the paper acknowledges this comparison has limitations due to different resource utilization (cloud vs. local).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Continued chat contributes about 40% of CoverUp's successes.",
    306       "evidence": "Section 4.4 (RQ3) analyzes success rates across chat iterations.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CoverUp uses coverage analysis, code context, and iterative error feedback to guide LLM-based test generation for Python, substantially outperforming CodaMosa and MuTAP on coverage metrics. On the CM benchmark, CoverUp achieves 80% per-module median line+branch coverage vs. CodaMosa's 47%. Ablation studies confirm all three components (coverage prompting, code context, error fixing) contribute meaningfully. CoverUp runs 18x faster than CodaMosa while using 48% more tokens.",
    312   "red_flags": [
    313     {
    314       "flag": "Benchmark contamination not addressed",
    315       "detail": "The benchmarks are derived from public open-source Python projects. GPT-4o was likely trained on this code, which could inflate coverage results if the model has memorized test patterns for these projects. This is never discussed."
    316     },
    317     {
    318       "flag": "Single-run results without variance",
    319       "detail": "Despite using temperature=0, LLM outputs can still vary. Results appear to be single-run without repeated trials or variance reporting, making it impossible to assess result stability."
    320     },
    321     {
    322       "flag": "No conflict of interest disclosure",
    323       "detail": "One author is affiliated with Amazon Web Services, which competes in the AI/LLM space and offers testing services. No conflict of interest statement or funding disclosure is provided."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "CodaMosa: Escaping Coverage Plateaus in Test Generation with Pre-trained Large Language Models",
    329       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K. Lahiri", "Siddhartha Sen"],
    330       "year": 2023,
    331       "relevance": "Primary baseline; hybrid search/LLM-based test generator that CoverUp improves upon."
    332     },
    333     {
    334       "title": "Effective Test Generation Using Pre-trained Large Language Models and Mutation Testing",
    335       "authors": ["Dakhel et al."],
    336       "year": 2024,
    337       "relevance": "Second primary baseline (MuTAP); mutation-guided LLM test generator."
    338     },
    339     {
    340       "title": "Evaluating Large Language Models Trained on Code",
    341       "authors": ["Mark Chen et al."],
    342       "year": 2021,
    343       "arxiv_id": "2107.03374",
    344       "relevance": "Codex/HumanEval paper foundational to LLM code generation evaluation."
    345     },
    346     {
    347       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
    348       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    349       "year": 2024,
    350       "relevance": "TestPilot approach to LLM-based unit test generation, a related approach."
    351     },
    352     {
    353       "title": "ChatUniTest: A Framework for LLM-Based Test Generation",
    354       "authors": ["Chen et al."],
    355       "year": 2024,
    356       "relevance": "LLM-based test generation framework using ChatGPT with iterative refinement."
    357     },
    358     {
    359       "title": "Fuzz4All: Universal Fuzzing with Large Language Models",
    360       "authors": ["Xia et al."],
    361       "year": 2024,
    362       "relevance": "LLM-based fuzzing approach that generates test inputs using language models."
    363     },
    364     {
    365       "title": "Lost in the Middle: How Language Models Use Long Contexts",
    366       "authors": ["Nelson F. Liu", "Kevin Lin", "John Hewitt", "Ashwin Paranjape", "Michele Bevilacqua", "Fabio Petroni", "Percy Liang"],
    367       "year": 2023,
    368       "relevance": "Demonstrates LLM attention degradation in long contexts, relevant to prompt design decisions."
    369     },
    370     {
    371       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    372       "authors": ["Xia et al."],
    373       "year": 2023,
    374       "relevance": "LLM-based automated program repair, related to test-and-repair paradigm."
    375     },
    376     {
    377       "title": "Whole Test Suite Generation",
    378       "authors": ["Gordon Fraser", "Andrea Arcuri"],
    379       "year": 2013,
    380       "relevance": "EvoSuite whole test suite generation approach, foundational baseline in test generation."
    381     }
    382   ]
    383 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs