scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21106B)
      1 {
      2   "paper": {
      3     "title": "CODAMOSA: Escaping Coverage Plateaus in Test Generation with Pre-trained Large Language Models",
      4     "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K. Lahiri", "Siddhartha Sen"],
      5     "year": 2023,
      6     "venue": "ICSE 2023",
      7     "doi": ""
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository provided: https://github.com/microsoft/codamosa. Section VII states 'Our source code is available at' with the URL."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Section VII (Data availability) states the repository includes a replication folder with data from the similarity analysis, scripts to generate plots, and information on accessing raw evaluation data including Codex-generated test cases."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Section VII mentions Docker containers for cloning projects, filtering modules, and running CODAMOSA. Built on Pynguin 0.19.0 (Section V-A)."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Section VII describes the replication folder containing: (1) docker container to clone projects and filter modules, (2) docker container to run CODAMOSA, (3) similarity analysis data, (4) scripts to generate plots."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No confidence intervals or error bars reported. Results are presented as counts of benchmarks with significant differences and average coverage values without uncertainty bounds."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section V-B: 'Following Arcuri and Briand's guidelines, we use a Mann-Whitney U-Test to compare the significance of coverage differences between techniques, at p = 0.05.'"
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported as absolute percentage point differences with baseline context, e.g., 'average magnitude of the coverage increase is also higher (10% and 9%) than the average magnitude of decreases (−4% and −3%)' in the abstract and Section V-B."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for why 486 benchmarks or 16 runs per benchmark were chosen. The 10-minute time budget is justified by reference to prior work (Pynguin evaluation) but the number of runs is not."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Results report averages across 16 runs but no standard deviations, IQR, or other spread measures are provided for the coverage results."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Two baselines: MOSA (search-based only) and CODEXONLY (LLM only). Additionally, a Union baseline combining MOSA and CODEXONLY test suites is evaluated."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "MOSA is the state-of-the-art SBST algorithm implemented in Pynguin. The paper explains why DynaMOSA was not used (Pynguin 0.19.0 limitation). Codex was the leading publicly available code LLM at time of evaluation."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section V-C evaluates four design decisions individually: uninterpreted statements (CODAMOSA-NOUNINTERP), temperature (CODAMOSA-TEMP-0.2), targeting strategy (CODAMOSA-RANDOM), and prompting (CODAMOSA-TESTCASEPROMPT)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Uses both line coverage and branch coverage (combined as 'line + branch coverage'). Also reports test suite size and number of benchmarks with significant improvements/decreases."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Human evaluation is not relevant here; the paper evaluates automated test generation where code coverage is the appropriate automated metric."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Not a machine learning evaluation paper with train/test splits. The benchmarks are programs to generate tests for, not a test set for a trained model."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Per-benchmark results shown in scatter plots (Fig. 3). Case studies analyze top-20 improvement and all 10 decrease benchmarks individually. Table I provides per-project characteristics."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section V-D2 analyzes all 10 benchmarks where CODAMOSA had lower coverage than MOSA, identifying causes: wrong signature, unparseable constructs, token limitations, and wasted exploration time."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section V-D2 reports coverage decreases on 10 benchmarks. Section V-C reports that test-case prompting was less consistent (49 improvements vs 24 decreases). Uninterpreted statements sometimes cluttered test cases."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of '173 and 279' benchmarks with higher coverage vs '10 and 4' with lower coverage are directly supported by Section V-B results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about why CODAMOSA improves coverage are supported by ablation studies (Section V-C) isolating individual components and case studies (Section V-D) analyzing specific mechanisms (special strings, backup callables, uninterpreted statements)."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section VI (Threats to Validity, External) explicitly states 'these results may not hold for any arbitrary Python module.' The paper is clear it evaluates Python only and uses Codex specifically."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section V-E investigates whether Codex is simply memorizing tests from training data. The Union experiment tests whether CODAMOSA is just combining independent results. Section V-D2 discusses time wasted on unproductive Codex queries as an explanation for the benchmarks where coverage decreased."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'Codex' throughout without specifying a model version (e.g., code-davinci-002). No API version or snapshot date is provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section IV-B1 describes the exact prompt structure: source code of the module under test, followed by a test function header with specific templates for functions, methods, and constructors. The actual prompt format is given (e.g., '# Unit test for function X\\ndef test_X():')."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section V-A3 reports: maxStallLen=25, max queries to Codex=10, population size=50 (Pynguin default). Temperature=0.8 default, and 200 token completion cutoff (Section V-D2). Search time T=10 minutes, 16 runs."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "CODAMOSA does not use agentic scaffolding. It makes single-turn API calls to Codex within a search algorithm — no retry logic, feedback loops, or agent-style scaffolding."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section V-A2 documents benchmark filtering: 35 projects identified, pipreqs for dependencies, preliminary MOSA runs to remove failures and 100% coverage modules, down-sampling of shared parent modules, resulting in 486 modules from 27 projects."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section VI 'Threats to Validity' is a dedicated section discussing internal, external, and construct validity threats."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section VI provides specific threats: potential deserialization bugs (internal), results may not generalize to arbitrary Python modules (external), and coverage may not correlate with bug-finding ability (construct), citing a specific study [42]."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section VI (External) explicitly states results may not hold for arbitrary Python modules. Section VII discusses that LLMs are 'not the solution to every problem in test case generation.' The evaluation is explicitly scoped to Python with Codex."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section VII states the replication folder contains raw evaluation data and Codex-generated test cases that can replay CODAMOSA runs."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section V-A2 describes benchmark collection from Pynguin and BugsInPy sources, with specific project revisions listed in Table I. Section V-A4 describes experimental procedure (16 runs, 10 minutes each)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Benchmarks are software modules from existing projects."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section V-A2 documents the full pipeline: 35 projects → dependency identification → module extraction → preliminary MOSA filtering → down-sampling → 486 benchmarks from 27 projects. Filtering criteria are stated."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section found in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Caroline Lemieux at University of British Columbia, three co-authors at Microsoft Research. The footnote notes 'Most implementation/evaluation work conducted at Microsoft Research.'"
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Three of four authors are from Microsoft Research, and the paper uses OpenAI's Codex (Microsoft has invested heavily in OpenAI). Microsoft has a financial interest in demonstrating the utility of LLMs for software engineering. No discussion of this potential conflict."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper does not state Codex's training data cutoff date. It mentions Codex was trained on '55 million GitHub repositories' but not when this data was collected."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section V-E directly investigates whether Codex generations are copied from training data by comparing to out-of-prompt test files. They also test on flutils (hosted on GitLab, unlikely in Codex's GitHub training set) as a contamination control."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Section V-E addresses this with edit distance analysis showing most generated tests have low similarity to existing tests (99.1% of high-similarity cases are just 'pass'). The flutils GitLab experiment provides evidence CODAMOSA works on code unlikely in training data."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Section V-D2 reports: 'On average over all benchmarks, CODAMOSA issues 60 queries to Codex and spends 413 seconds waiting for these queries. This rises to 480 seconds on the worst-performing benchmarks.' Time cost per query is implicitly reported."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total compute budget stated. The paper reports 486 benchmarks x 16 runs x multiple configurations but does not state total API cost, GPU hours, or wall-clock time for the full evaluation."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CODAMOSA achieves statistically significantly higher coverage on 173 benchmarks compared to MOSA, while reducing coverage on only 10.",
    286       "evidence": "Section V-B, Fig. 3a. Mann-Whitney U-test at p=0.05 across 486 benchmarks with 16 runs each.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "CODAMOSA achieves significantly higher coverage on 279 benchmarks compared to CODEXONLY, while reducing coverage on only 4.",
    291       "evidence": "Section V-B, Fig. 3b. Same statistical methodology.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "CODAMOSA outperforms the union of MOSA and CODEXONLY test suites on 17% of benchmarks despite having half the search time.",
    296       "evidence": "Section V-B. On 429 replayable benchmarks, CODAMOSA was significantly better on 72, worse on 43, same on 314.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Higher temperature (0.8) sampling is the most consistently positive design decision.",
    301       "evidence": "Section V-C2, Fig. 3d. Temperature 0.8 achieves significantly higher coverage on 113 benchmarks vs 9 for temperature 0.2.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Codex-generated tests are mostly not copied from out-of-prompt code in the repositories.",
    306       "evidence": "Section V-E, Fig. 4. Majority of generated tests have similarity ≤0.4. 99.1% of high-similarity cases are the statement 'pass'.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CODAMOSA combines search-based software testing (MOSA) with LLM-generated test case hints from Codex to escape coverage plateaus. On 486 Python benchmarks with 16 runs each, CODAMOSA achieves significantly higher coverage than MOSA on 173 benchmarks and than a Codex-only baseline on 279 benchmarks, with few regressions (10 and 4 respectively). Ablation studies show higher sampling temperature, uninterpreted statements, and low-coverage targeting all contribute, with temperature having the most consistent effect. Coverage decreases are primarily caused by wasted time on Codex queries that produce unparseable or unhelpful output.",
    312   "red_flags": [
    313     {
    314       "flag": "No variance reporting",
    315       "detail": "Despite 16 runs per benchmark, no standard deviations or other spread measures are reported for coverage results. Only averages and significance test counts are provided."
    316     },
    317     {
    318       "flag": "Codex model version unspecified",
    319       "detail": "The paper uses 'Codex' without specifying which model version (e.g., code-davinci-001 vs code-davinci-002). This prevents exact replication of the Codex queries, a problem compounded by the Codex API having since been deprecated."
    320     }
    321   ],
    322   "cited_papers": [
    323     {
    324       "title": "Evaluating Large Language Models Trained on Code",
    325       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    326       "year": 2021,
    327       "arxiv_id": "2107.03374",
    328       "relevance": "Introduces Codex, the LLM used in CODAMOSA for test generation, foundational to LLM-based code generation evaluation."
    329     },
    330     {
    331       "title": "Pynguin: Automated Unit Test Generation for Python",
    332       "authors": ["Stephan Lukasczyk", "G. Fraser"],
    333       "year": 2022,
    334       "arxiv_id": "2202.05218",
    335       "relevance": "The SBST framework CODAMOSA is built on; key tool for automated Python test generation."
    336     },
    337     {
    338       "title": "Can OpenAI Codex and Other Large Language Models Help Us Fix Security Bugs?",
    339       "authors": ["H. Pearce", "B. Tan", "B. Ahmad", "R. Karri", "B. Dolan-Gavitt"],
    340       "year": 2021,
    341       "arxiv_id": "2112.02125",
    342       "relevance": "Evaluates LLMs for automated program repair of security bugs, directly relevant to LLM code quality assessment."
    343     },
    344     {
    345       "title": "Automatic Program Repair with OpenAI's Codex: Evaluating QuixBugs",
    346       "authors": ["J. A. Prenner", "R. Robbes"],
    347       "year": 2021,
    348       "arxiv_id": "2111.03922",
    349       "relevance": "Evaluates LLM-based automated program repair, relevant to understanding LLM capabilities in software engineering."
    350     },
    351     {
    352       "title": "Code Generation Tools (Almost) for Free? A Study of Few-Shot, Pre-Trained Language Models on Code",
    353       "authors": ["P. Bareiß", "B. Souza", "M. d'Amorim", "M. Pradel"],
    354       "year": 2022,
    355       "relevance": "Compares Codex to traditional test generation tools (Randoop) on code coverage, directly comparable work."
    356     },
    357     {
    358       "title": "Jigsaw: Large Language Models Meet Program Synthesis",
    359       "authors": ["N. Jain", "S. Vaidyanath", "A. Iyer"],
    360       "year": 2022,
    361       "relevance": "Uses LLMs for program synthesis with test-case-based post-processing, relevant to LLM+testing integration approaches."
    362     },
    363     {
    364       "title": "TOGA: A Neural Method for Test Oracle Generation",
    365       "authors": ["E. Dinella", "G. Ryan", "T. Mytkowicz", "S. Lahiri"],
    366       "year": 2022,
    367       "relevance": "Neural approach to test oracle generation, complementary to CODAMOSA's test case generation."
    368     },
    369     {
    370       "title": "Productivity assessment of neural code completion",
    371       "authors": ["A. Ziegler", "E. Kalliamvakou", "X. A. Li"],
    372       "year": 2022,
    373       "relevance": "Assesses productivity impact of neural code completion (Copilot), relevant to evaluating LLM tools in software engineering."
    374     },
    375     {
    376       "title": "A Practical Guide for Using Statistical Tests to Assess Randomized Algorithms in Software Engineering",
    377       "authors": ["A. Arcuri", "L. Briand"],
    378       "year": 2011,
    379       "relevance": "Methodological guidelines followed in CODAMOSA's evaluation; important reference for statistical rigor in SE experiments."
    380     },
    381     {
    382       "title": "Competition-level code generation with AlphaCode",
    383       "authors": ["Y. Li", "D. Choi", "J. Chung"],
    384       "year": 2022,
    385       "arxiv_id": "2203.07814",
    386       "relevance": "Major LLM code generation system, relevant to understanding LLM code capabilities."
    387     }
    388   ]
    389 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs