ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30452B)


      1 {
      2   "paper": {
      3     "title": "From Code Generation to Software Testing: AI Copilot With Context-Based Retrieval-Augmented Generation",
      4     "authors": [
      5       "Yuchen Wang",
      6       "Shangxin Guo",
      7       "Chee Wei Tan"
      8     ],
      9     "year": 2025,
     10     "venue": "IEEE Software",
     11     "arxiv_id": "2504.01866",
     12     "doi": "10.1109/MS.2025.3549628"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval", "case-study"],
     17   "key_findings": "The paper proposes Copilot for Testing, an AI-assisted testing system using context-based RAG that models codebases as graphs with dynamically updated embeddings. Evaluated on SIR (Software-artifact Infrastructure Repository) programs, the system reports a 31.2% improvement in bug detection accuracy and 12.6% increase in critical test coverage compared to a baseline without RAG. A user study with 12 iOS developers showed a 10.5% higher suggestion acceptance rate, though no statistical tests were performed on any results.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper provides a GitHub URL (https://github.com/intitni/CopilotForXcode) for the base Copilot for Xcode tool, which the testing extension builds upon. However, it is not explicitly stated whether the testing module code is included in this repository."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The evaluation uses the publicly available Software-artifact Infrastructure Repository (SIR) at https://sir.csc.ncsu.edu/. However, the specific curated subset of Swift and adapted C++ projects used is not separately released."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No environment specifications are provided — no requirements.txt, Dockerfile, or detailed dependency list. The paper does not describe what libraries or versions are needed to reproduce the system."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. The paper describes the architecture and system flow at a high level but does not include commands, scripts, or a README for reproducing the experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Table 1 reports only point estimates (e.g., 85.3% bug detection accuracy) with no confidence intervals, error bars, or uncertainty quantification on any metric."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims the proposed model outperforms the baseline on multiple metrics (e.g., +31.2% bug detection) but performs no statistical significance tests. All comparisons are based solely on comparing two numbers."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Table 1 reports percentage improvements with both baseline and proposed model values, providing context for the magnitude of differences (e.g., bug detection from 54.1% to 85.3%, a 31.2-percentage-point improvement)."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The user study uses only 12 iOS developers with no justification for this sample size and no power analysis. The number of SIR programs and mutants used is also not specified or justified."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures are reported for any experiment. Results appear to be from single runs with no indication of result stability."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 1 compares the proposed model against a baseline model that does not use the context-based RAG module, providing a direct comparison across multiple metrics."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The only baseline is the authors' own system without RAG. There is no comparison against any external contemporary automated testing tool, LLM-based testing approach, or prior state-of-the-art method."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "The system has multiple components (graph-based embeddings with 5 factors, RAG retriever, prompt constructor), but only a single comparison is made (with vs. without the entire RAG module). No systematic ablation shows which individual components (file path, cursor position, file content, bug logs, graph connectivity) contribute to performance."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Table 1 reports six metrics: bug detection accuracy, overall test coverage, critical coverage, cross-file bug detection, execution time per bug, and suggestion acceptance rate."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 5.2 describes a user study with 12 iOS developers who evaluated the tool by completing testing-related tasks and providing qualitative feedback on ease of use, compatibility, and workflow impact."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No discussion of separating dev and test sets. The paper mentions that parameters were configured via 'trial and error' (Section 6) but does not indicate whether tuning and evaluation were performed on separate data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Results are reported as aggregate numbers in Table 1. There is no breakdown by project type, bug complexity level, programming language (Swift vs. C++), or individual SIR programs."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "The paper does not analyze specific failure cases or show examples of bugs the system missed. The subjective evaluation notes performance issues during bulk operations, but no systematic failure analysis is provided."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 1 reports a -1.3% decrease in overall test coverage compared to the baseline. The paper discusses this as a deliberate tradeoff favoring critical coverage. The user study also reports a steep learning curve and slower bulk operation response times."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims of 31.2% improvement in bug detection accuracy, 12.6% increase in critical test coverage, and 10.5% higher acceptance rate are all supported by Table 1 and the user study in Section 5."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper's core causal claim is that the RAG module improves testing performance. This is supported by a single-variable controlled comparison: the proposed and baseline systems differ only in the presence or absence of the RAG module."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper tests on Swift and adapted C++ projects from SIR within Xcode, but the abstract and conclusion make broad claims about 'modern software development practices' and 'the transformative potential of AI-driven technologies' without bounding these to the tested languages, platforms, or project types."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper attributes improved performance to contextual information and graph-based embeddings but does not consider alternative explanations such as prompt length effects, differences in LLM stochastic outputs, or the possibility that the SIR programs happen to be particularly amenable to context-based approaches."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper measures suggestion acceptance rate and calls it 'efficiency,' stating 'accepted generations directly imply saved engineering efforts' (Section 5). This conflates acceptance with efficiency without acknowledging that acceptance rate is a proxy — developers may accept poor suggestions, or reject correct ones, and true efficiency involves broader workflow factors."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper refers only to 'cloud-based LLMs' throughout. No specific model name, version, provider, or snapshot date is ever mentioned. This is a critical omission — the reader cannot know which model was used."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Section 4.3 describes the prompt structure (4 components: context system prompt, message history, current question, config system prompt) in natural language but provides no actual prompt text. The reader cannot reconstruct any prompt sent to the model."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Section 4.3 mentions 'model parameters, temperature, and mode settings' as part of the config system prompt, but no actual values are reported anywhere in the paper."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The system architecture is described in detail: Figures 1-3 show the RAG retriever with graph-based codebase modeling, the prompt constructor, the system flow, and the feedback loop. Section 4.2 lists the five embedding factors (file path, cursor position, file content, bug logs, graph connectivity) and describes information propagation."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "The paper does not document how SIR programs were curated, how Swift projects were selected, how C++ projects were adapted, or any preprocessing steps applied to the evaluation data. The transition from 'curated database' to results is not explained."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "There is no dedicated limitations or threats-to-validity section. Section 6 (Future Research Directions) mentions areas for improvement (parameter tuning, user experience, platform expansion) but these are framed as future work, not limitations."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No threats to validity are discussed. The user study feedback about the learning curve and slow bulk operations (Section 5.2) consists of observations, not a structured validity discussion."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The paper does not explicitly state what the results do NOT show. There is no acknowledgment that results are limited to SIR mutant programs, Xcode/Swift+C++ only, or a specific (unspecified) LLM."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No raw data is provided — neither the specific SIR program outputs, fault-revealing data, coverage logs, nor the user study acceptance/rejection logs are made available."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The paper says it used 'a curated database of Swift and C++ projects from SIR' and followed the 'standardized workflow,' but does not specify which programs were used, how many mutants, how C++ projects were adapted to Swift, or the selection criteria for the curation."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The user study involves '12 iOS developers' but provides no information on how they were recruited, from what organization, their experience level, or any selection criteria."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "The pipeline from raw SIR programs to final metrics in Table 1 is not documented. There is no description of how fault-revealing data was collected, filtered, or aggregated into the reported percentages."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The Acknowledgment section states: 'This research was supported by the Singapore Ministry of Education Academic Research Fund under Grant RG91/22.'"
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: Nanyang Technological University (Singapore) and City University of Hong Kong. Authors are not affiliated with any company whose product is being evaluated."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "The Singapore Ministry of Education Academic Research Fund is an independent government funding source with no financial stake in whether the proposed system outperforms baselines."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is provided. The authors' prior work (Copilot for Xcode) was 're-licensed and assimilated into GitHub,' suggesting potential financial interests that are not disclosed."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper never specifies which LLM is used, let alone its training data cutoff date. Since the system relies on 'cloud-based LLMs' evaluated on SIR programs, the training cutoff is relevant but unstated."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "SIR programs and their mutants have been publicly available for years. The unspecified LLM could have been trained on this data, but the paper does not discuss potential train/test overlap."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The Software-artifact Infrastructure Repository has been online since the mid-2000s. Any modern LLM could have seen these programs and their known bugs during training, but contamination risk is not addressed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No mention of pre-registration for the user study. No link to OSF, AsPredicted, or any registry."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "No mention of IRB or ethics board approval for the user study involving 12 iOS developers."
    257       },
    258       "demographics_reported": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "Participants are described only as '12 iOS developers.' No demographics are reported — no experience level, years of experience, gender, or other characterization."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": true,
    265         "answer": false,
    266         "justification": "No inclusion or exclusion criteria are stated for the 12 participants. The paper does not describe who was eligible or how they were selected."
    267       },
    268       "randomization_described": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "The paper states 'Participants were divided into two groups' but does not describe how the division was done — no mention of randomization procedure, stratification, or assignment method."
    272       },
    273       "blinding_described": {
    274         "applies": true,
    275         "answer": false,
    276         "justification": "No blinding is described. Participants likely knew which version of the tool they were using (proposed vs. baseline), but this is not discussed."
    277       },
    278       "attrition_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No information on whether all 12 participants completed the study or whether any dropped out."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Table 1 reports execution time per bug: 0.42 seconds for the proposed model vs. 0.68 seconds for the baseline. However, no API costs or token consumption figures are provided."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No total computational budget is stated — no GPU hours, total API spend, or hardware specifications for running the experiments."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of multiple random seeds or runs. Results appear to be from single executions with no sensitivity analysis."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs is never stated. It is unclear whether results are from a single run or averaged over multiple runs."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Section 6 acknowledges parameters 'are configured based on optimal values determined through trial and error' but provides no details on search budget, number of configurations tried, or search method."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Parameters were selected via 'trial and error' with no description of selection criteria, validation procedure, or how many configurations were tested. Only the final configuration's results are reported."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The paper reports comparisons across 6 metrics without any statistical tests at all, let alone corrections for multiple comparisons."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors evaluate their own system against their own baseline (system without RAG). There is no acknowledgment of self-comparison bias, no independent evaluation, and no external baseline."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The RAG module adds computational overhead (graph updates, embedding propagation, retrieval) compared to the baseline, but no analysis of performance as a function of compute budget is provided."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper uses SIR programs with artificial mutants but does not discuss whether mutant-based evaluation is a valid proxy for real-world bug detection capability. The validity of mutants as representative of real bugs is a known concern in software testing research."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "The RAG module IS the variable being tested — the paper compares the same system with and without the RAG scaffold. The scaffold is the thing being evaluated, not a confound."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "SIR programs have been publicly available since the mid-2000s. The unspecified LLM was likely trained on data including these programs, but temporal leakage is not discussed."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks information. The RAG module provides the LLM with local code context, but whether this context inadvertently reveals bug locations is not analyzed."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether the SIR programs used for evaluation are independent of the LLM's training data."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination is mentioned."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "The proposed testing methodology achieves a 31.2% higher bug detection rate compared to the baseline model (85.3% vs. 54.1%).",
    369       "evidence": "Table 1, Section 5.1. Evaluated on SIR programs with known mutants. No error bars, confidence intervals, or significance tests accompany this result.",
    370       "supported": "weak"
    371     },
    372     {
    373       "claim": "Critical test coverage increased by 12.6% (83.6% vs. 71.0%) while overall test coverage slightly decreased by 1.3%.",
    374       "evidence": "Table 1, Section 5.1. Critical coverage defined using graph node embeddings to identify important code paths. No statistical testing or variance reporting.",
    375       "supported": "weak"
    376     },
    377     {
    378       "claim": "Cross-file bug detection improved by 32.2% (81.2% vs. 49.0%).",
    379       "evidence": "Table 1, Section 5.1. No details on what constitutes cross-file bugs, how many were in the dataset, or how detection was measured.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "The proposed model achieved a 10.5% higher suggestion acceptance rate (31.9% vs. 21.4%) in user studies.",
    384       "evidence": "Table 1, Section 5.2. Based on a user study with 12 iOS developers divided into test and control groups. No statistical test, no details on number of suggestions, no confidence intervals.",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "The approach reduces execution time per detected bug from 0.68 to 0.42 seconds.",
    389       "evidence": "Table 1, Section 5.1. Single point estimate with no variance or repeated measurements reported.",
    390       "supported": "weak"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Unspecified LLM",
    396       "detail": "The paper never identifies which LLM is used — only 'cloud-based LLMs.' This is a critical omission that makes the work impossible to reproduce. Model choice and version drastically affect results in LLM-based systems."
    397     },
    398     {
    399       "flag": "No statistical tests on any result",
    400       "detail": "All comparisons in Table 1 are based on comparing two point estimates with no significance tests, confidence intervals, or variance measures. With unknown sample sizes and what appear to be single-run results, none of the claimed improvements can be distinguished from noise."
    401     },
    402     {
    403       "flag": "Tiny uncharacterized user study",
    404       "detail": "The user study uses only 12 developers (6 per group) with no demographics, no recruitment description, no randomization procedure, no IRB approval, and no statistical analysis. This sample size is too small for reliable quantitative conclusions."
    405     },
    406     {
    407       "flag": "No external baselines",
    408       "detail": "The only comparison is against the authors' own system without RAG. No comparison against any existing automated testing tool, LLM-based testing approach, or prior work, making it impossible to assess the system's relative standing."
    409     },
    410     {
    411       "flag": "Suspiciously large improvements without uncertainty quantification",
    412       "detail": "A 31.2% improvement in bug detection accuracy from adding RAG is an extraordinarily large effect. Combined with no error bars, no repeated runs, and no significance tests, these results should be viewed with skepticism."
    413     },
    414     {
    415       "flag": "Undisclosed evaluation details",
    416       "detail": "The number of SIR programs, number of mutants, how C++ projects were adapted to Swift, and how 'critical coverage' importance weights were determined are all unspecified, preventing any assessment of the evaluation's validity."
    417     },
    418     {
    419       "flag": "Potential data contamination",
    420       "detail": "SIR programs have been publicly available since the mid-2000s. The unspecified LLM could have memorized these programs and their known bugs during training, potentially inflating bug detection rates. This is never discussed."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "Evaluating large language models trained on code",
    426       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    427       "year": 2021,
    428       "arxiv_id": "2107.03374",
    429       "relevance": "Foundational paper on LLM code generation evaluation (Codex/HumanEval), directly relevant to the survey's scope of AI/LLM code generation capability assessment."
    430     },
    431     {
    432       "title": "An empirical evaluation of GitHub Copilot's code suggestions",
    433       "authors": ["Nhan Nguyen", "Sarah Nadi"],
    434       "year": 2022,
    435       "relevance": "Empirical study evaluating GitHub Copilot's code suggestion quality, relevant to AI-assisted programming tool assessment."
    436     },
    437     {
    438       "title": "Aligning crowd-sourced human feedback for reinforcement learning on code generation by large language models",
    439       "authors": ["Man Fai Wong", "Chee Wei Tan"],
    440       "year": 2024,
    441       "relevance": "Studies RLHF for LLM code generation, relevant to the survey's scope of LLM programming improvement methods."
    442     },
    443     {
    444       "title": "An initial investigation of ChatGPT unit test generation capability",
    445       "authors": ["Vitor Guilherme", "Auri Vincenzi"],
    446       "year": 2023,
    447       "relevance": "Directly evaluates ChatGPT's ability to generate unit tests, central to the survey's coverage of AI-assisted testing."
    448     },
    449     {
    450       "title": "Software testing research challenges: An industrial perspective",
    451       "authors": ["Nadia Alshahwan", "Mark Harman", "Alexandru Marginean"],
    452       "year": 2023,
    453       "relevance": "Industrial perspective on software testing challenges including AI-driven approaches, relevant to understanding the testing landscape."
    454     },
    455     {
    456       "title": "Prompting is programming: A query language for large language models",
    457       "authors": ["Luca Beurer-Kellner", "Marc Fischer", "Martin Vechev"],
    458       "year": 2023,
    459       "relevance": "Foundational work on prompting methodology for LLMs, relevant to the survey's coverage of LLM interaction paradigms."
    460     },
    461     {
    462       "title": "A multi-year grey literature review on AI-assisted test automation",
    463       "authors": ["Filippo Ricca", "Alessandro Marchetto", "Andrea Stocco"],
    464       "year": 2024,
    465       "arxiv_id": "2408.06224",
    466       "relevance": "Systematic review of AI-assisted test automation literature, directly relevant as a survey of the same field."
    467     },
    468     {
    469       "title": "Copilot for Xcode: exploring AI-assisted programming by prompting cloud-based large language models",
    470       "authors": ["Chee Wei Tan", "Shangxin Guo", "Man Fai Wong", "Ching Nam Hang"],
    471       "year": 2023,
    472       "arxiv_id": "2307.14349",
    473       "relevance": "Prior work by the same authors on AI-assisted programming via cloud LLMs, the foundation for the testing extension evaluated here."
    474     },
    475     {
    476       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    477       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    478       "year": 2020,
    479       "relevance": "Foundational RAG paper — the core technique applied in this work for enhancing LLM-based software testing."
    480     },
    481     {
    482       "title": "Navigating the complexity of generative AI adoption in software engineering",
    483       "authors": ["Daniel Russo"],
    484       "year": 2024,
    485       "relevance": "Studies generative AI adoption in software engineering, relevant to understanding the broader context of AI-assisted development tools."
    486     }
    487   ],
    488   "engagement_factors": {
    489     "practical_relevance": {
    490       "score": 2,
    491       "justification": "Describes a developer tool (Xcode IDE plugin for automated testing), and the base Copilot for Xcode has a public GitHub repo, but the testing extension's availability is unclear."
    492     },
    493     "surprise_contrarian": {
    494       "score": 0,
    495       "justification": "Confirms the expected finding that adding RAG context improves LLM performance — no challenge to conventional wisdom."
    496     },
    497     "fear_safety": {
    498       "score": 0,
    499       "justification": "No AI safety, security, or risk concerns are raised by this work."
    500     },
    501     "drama_conflict": {
    502       "score": 0,
    503       "justification": "No controversy, no challenge to established benchmarks or claims."
    504     },
    505     "demo_ability": {
    506       "score": 1,
    507       "justification": "The base Copilot for Xcode is on GitHub, but the testing module described in this paper is not clearly available for others to try."
    508     },
    509     "brand_recognition": {
    510       "score": 1,
    511       "justification": "Authors are from NTU Singapore and CityU Hong Kong — known institutions but not famous AI labs. The tool echoes 'GitHub Copilot' branding."
    512     }
    513   }
    514 }

Impressum · Datenschutz