scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (36675B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "From Code Generation to Software Testing: AI Copilot With Context-Based Retrieval-Augmented Generation",
      6     "authors": [
      7       "Yuchen Wang",
      8       "Shangxin Guo",
      9       "Chee Wei Tan"
     10     ],
     11     "year": 2025,
     12     "venue": "IEEE Software",
     13     "arxiv_id": "2504.01866",
     14     "doi": "10.1109/MS.2025.3549628"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract claims of 31.2% improvement in bug detection accuracy, 12.6% increase in critical test coverage, and 10.5% higher acceptance rate are all supported by Table 1 and the user study in Section 5.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper's core causal claim is that the RAG module improves testing performance. This is supported by a controlled comparison where the only manipulated variable is the presence/absence of the RAG module (proposed vs. baseline), constituting a controlled single-variable manipulation.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper tests on Swift and adapted C++ projects from SIR within Xcode, but the abstract and conclusion make broad claims about 'modern software development practices' and 'the transformative potential of AI-driven technologies' without bounding these to the tested languages, platforms, or project types.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper attributes improved performance to contextual information and graph-based embeddings but does not consider alternative explanations such as prompt length effects, differences in LLM stochastic outputs, or the possibility that the SIR programs happen to be particularly amenable to context-based approaches.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper measures suggestion acceptance rate and calls it 'efficiency,' stating 'accepted generations directly imply saved engineering efforts' (Section 5). This conflates acceptance with efficiency without acknowledging that acceptance rate is a proxy — developers may accept poor suggestions, or reject correct ones, and true efficiency involves broader workflow factors.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "There is no dedicated limitations or threats-to-validity section. Section 6 (Future Research Directions) mentions areas for improvement (parameter tuning, user experience, platform expansion) but these are framed as future work, not limitations.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No threats to validity are discussed. The user study feedback about the learning curve and slow bulk operations (Section 5.2) consists of observations, not a structured validity discussion.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper does not explicitly state what the results do NOT show. There is no acknowledgment that results are limited to SIR mutant programs, Xcode/Swift+C++ only, or a specific (unspecified) LLM.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The Acknowledgment section states: 'This research was supported by the Singapore Ministry of Education Academic Research Fund under Grant RG91/22.'",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly listed: Nanyang Technological University (Singapore) and City University of Hong Kong. Authors are not affiliated with any company whose product is being evaluated.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The Singapore Ministry of Education Academic Research Fund is an independent government funding source with no financial stake in whether the proposed system outperforms baselines.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is provided. The authors' prior work (Copilot for Xcode) was 're-licensed and assimilated into GitHub,' suggesting potential financial interests that are not disclosed.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "RAG is explained as 'combining retrieval mechanisms with generative models', but the key innovation, 'context-based RAG', and terms like 'bug detection' are defined primarily through the methodology rather than through upfront formal definitions.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Main contributions clearly stated as three bullets: (1) RAG methodology with specific improvements, (2) Copilot for Testing system integrated with IDE, (3) user study validation with acceptance rates.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Related work section (Section 2) engages with AI-assisted programming, automated testing, SBSE, and RAG literature; shows how this extends Copilot for Xcode from code generation to testing.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The paper provides a GitHub URL (https://github.com/intitni/CopilotForXcode) for the base Copilot for Xcode tool, which the testing extension builds upon. However, it is not explicitly stated whether the testing module code is included in this repository.",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The evaluation uses the publicly available Software-artifact Infrastructure Repository (SIR) at https://sir.csc.ncsu.edu/. However, the specific curated subset of Swift and adapted C++ projects used is not separately released.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No environment specifications are provided — no requirements.txt, Dockerfile, or detailed dependency list. The paper does not describe what libraries or versions are needed to reproduce the system.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are provided. The paper describes the architecture and system flow at a high level but does not include commands, scripts, or a README for reproducing the experiments.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Table 1 reports only point estimates (e.g., 85.3% bug detection accuracy) with no confidence intervals, error bars, or uncertainty quantification on any metric.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "The paper claims the proposed model outperforms the baseline on multiple metrics (e.g., +31.2% bug detection) but performs no statistical significance tests. All comparisons are based solely on comparing two numbers.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Table 1 reports percentage improvements with both baseline and proposed model values, providing context for the magnitude of differences (e.g., bug detection from 54.1% to 85.3%, a 31.2% improvement).",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The user study uses only 12 iOS developers with no justification for this sample size and no power analysis. The number of SIR programs and mutants used is also not specified or justified.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance, standard deviation, or spread measures are reported for any experiment. Results appear to be from single runs with no indication of result stability.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Table 1 compares the proposed model against a baseline model that does not use the context-based RAG module, providing a direct comparison across multiple metrics.",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The only baseline is the authors' own system without RAG. There is no comparison against any external contemporary automated testing tool, LLM-based testing approach, or prior state-of-the-art method.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "The system has multiple components (graph-based embeddings with 5 factors, RAG retriever, prompt constructor), but only a single comparison is made (with vs. without the entire RAG module). No systematic ablation shows which individual components (file path, cursor position, bug logs, graph connectivity) contribute to performance.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Table 1 reports six metrics: bug detection accuracy, overall test coverage, critical coverage, cross-file bug detection, execution time per bug, and suggestion acceptance rate.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Section 5.2 describes a user study with 12 iOS developers who evaluated the tool by completing testing-related tasks and providing qualitative feedback on ease of use, compatibility, and workflow impact.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "No discussion of separating dev and test sets. The paper mentions that parameters were configured via 'trial and error' (Section 6) but does not indicate whether tuning and evaluation were performed on separate data.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": false,
    216           "justification": "Results are reported as aggregate numbers in Table 1. There is no breakdown by project type, bug complexity level, programming language (Swift vs. C++), or individual SIR programs.",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "The paper does not analyze specific failure cases or show examples of bugs the system missed. The subjective evaluation notes performance issues during bulk operations, but no systematic failure analysis is provided.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Table 1 reports a -1.3% decrease in overall test coverage compared to the baseline. The paper discusses this as a deliberate tradeoff favoring critical coverage. The user study also reports a steep learning curve and slower bulk operation response times.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "The paper refers only to 'cloud-based LLMs' throughout. No specific model name, version, provider, or snapshot date is ever mentioned. This is a critical omission — the reader cannot know which model was used.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "Section 4.3 describes the prompt structure (4 components: context system prompt, message history, current question, config system prompt) in natural language but provides no actual prompt text. The reader cannot reconstruct any prompt sent to the model.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Section 4.3 mentions 'model parameters, temperature, and mode settings' as part of the config system prompt, but no actual values are reported anywhere in the paper.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The system architecture is described in detail: Figures 1-3 show the RAG retriever with graph-based codebase modeling, the prompt constructor, the system flow, and the feedback loop. Section 4.2 lists the five embedding factors (file path, cursor position, file content, bug logs, graph connectivity) and describes information propagation.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": false,
    260           "justification": "The paper does not document how SIR programs were curated, how Swift projects were selected, how C++ projects were adapted, or any preprocessing steps applied to the evaluation data. The transition from 'curated database' to results is not explained.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "No raw data is provided — neither the specific SIR program outputs, fault-revealing data, coverage logs, nor the user study acceptance/rejection logs are made available.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "The paper says it used 'a curated database of Swift and C++ projects from SIR' and followed the 'standardized workflow,' but does not specify which programs were used, how many mutants, how C++ projects were adapted to Swift, or the selection criteria for the curation.",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": true,
    279           "answer": false,
    280           "justification": "The user study involves '12 iOS developers' but provides no information on how they were recruited, from what organization, their experience level, or any selection criteria.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "The pipeline from raw SIR programs to final metrics in Table 1 is not documented. There is no description of how fault-revealing data was collected, filtered, or aggregated into the reported percentages.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "The paper never specifies which LLM is used, let alone its training data cutoff date. Since the system relies on 'cloud-based LLMs' evaluated on SIR programs, the training cutoff is relevant but unstated.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "SIR programs and their mutants have been publicly available for years. The unspecified LLM could have been trained on this data, but the paper does not discuss potential train/test overlap.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "The Software-artifact Infrastructure Repository has been online since the mid-2000s. Any modern LLM could have seen these programs and their known bugs during training, but contamination risk is not addressed.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": true,
    313           "answer": false,
    314           "justification": "No mention of pre-registration for the user study. No link to OSF, AsPredicted, or any registry.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": true,
    319           "answer": false,
    320           "justification": "No mention of IRB or ethics board approval for the user study involving 12 iOS developers.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": true,
    325           "answer": false,
    326           "justification": "Participants are described only as '12 iOS developers.' No demographics are reported — no experience level, years of experience, gender, or other characterization.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": true,
    331           "answer": false,
    332           "justification": "No inclusion or exclusion criteria are stated for the 12 participants. The paper does not describe who was eligible or how they were selected.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": true,
    337           "answer": false,
    338           "justification": "The paper states 'Participants were divided into two groups' but does not describe how the division was done — no mention of randomization procedure, stratification, or assignment method.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": true,
    343           "answer": false,
    344           "justification": "No blinding is described. Participants likely knew which version of the tool they were using (proposed vs. baseline), but this is not discussed.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": true,
    349           "answer": false,
    350           "justification": "No information on whether all 12 participants completed the study or whether any dropped out.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Table 1 reports execution time per bug: 0.42 seconds for the proposed model vs. 0.68 seconds for the baseline. However, no API costs or token consumption figures are provided.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No total computational budget is stated — no GPU hours, total API spend, or hardware specifications for running the experiments.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No mention of multiple random seeds or runs. Results appear to be from single executions with no sensitivity analysis.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "The number of experimental runs is never stated. It is unclear whether results are from a single run or averaged over multiple runs.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "Section 6 acknowledges parameters 'are configured based on optimal values determined through trial and error' but provides no details on search budget, number of configurations tried, or search method.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "Parameters were selected via 'trial and error' with no description of selection criteria, validation procedure, or how many configurations were tested. Only the final configuration's results are reported.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": true,
    395           "answer": false,
    396           "justification": "The paper reports comparisons across 6 metrics without any statistical tests at all, let alone corrections for multiple comparisons.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors evaluate their own system against their own baseline (system without RAG). There is no acknowledgment of self-comparison bias, no independent evaluation, and no external baseline.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "The RAG module adds computational overhead (graph updates, embedding propagation, retrieval) compared to the baseline, but no analysis of performance as a function of compute budget is provided.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "The paper uses SIR programs with artificial mutants but does not discuss whether mutant-based evaluation is a valid proxy for real-world bug detection capability. The validity of mutants as representative of real bugs is a known concern in software testing research.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "The RAG module IS the variable being tested — the paper compares the same system with and without the RAG scaffold. The scaffold is the thing being evaluated, not a confound.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "SIR programs have been publicly available since the mid-2000s. The unspecified LLM was likely trained on data including these programs, but temporal leakage is not discussed.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of whether the evaluation setup leaks information. The RAG module provides the LLM with local code context, but whether this context inadvertently reveals bug locations is not analyzed.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of whether the SIR programs used for evaluation are independent of the LLM's training data.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination is mentioned.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "Context-based RAG improves bug detection accuracy by 31.2% over baseline",
    455       "evidence": "Table 1: Proposed Model 85.3%, Baseline Model 54.1%, +31.2% improvement",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "Graph-based embeddings enable 32.2% higher detection of cross-file dependency bugs",
    460       "evidence": "Table 1: Cross-File Bug Detection Proposed 81.2% vs Baseline 49.0%, +32.2%",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "Critical code path coverage improves by 12.6% with RAG-enhanced testing",
    465       "evidence": "Table 1: Critical Coverage 83.6% vs 71.0%, +12.6%; paper explicitly prioritizes critical paths",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "User acceptance of generated test suggestions increases by 10.5% with RAG module",
    470       "evidence": "Table 1: Suggestion Acceptance Rate 31.9% vs 21.4%, +10.5%; corroborated by Section 5.2 user study",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "Dynamic adaptation to code changes improves bug detection precision through context refinement",
    475       "evidence": "Section 5.1: 'attributed to its dynamic adaptation to code changes and deep contextual insights'; explanation provided but not independently validated through ablation",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "The framework is platform-agnostic and can be extended to Visual Studio, IntelliJ, and Eclipse with minimal adjustment",
    480       "evidence": "Section 4.5: 'platform-agnostic, relying on modular components'; claims portability but implementation only demonstrated on Xcode",
    481       "supported": "weak"
    482     }
    483   ],
    484   "methodology_tags": [
    485     "benchmark-eval",
    486     "case-study",
    487     "empirical"
    488   ],
    489   "key_findings": "Copilot for Testing, an AI-assisted testing system using context-based Retrieval-Augmented Generation with graph-based code embeddings, achieves 31.2% higher bug detection accuracy and 12.6% improvement in critical test coverage compared to a baseline without RAG, according to evaluation on Software-artifact Infrastructure Repository (SIR) synthetic mutation benchmarks. A user study with 12 iOS developers found a 10.5% higher acceptance rate for generated test suggestions. The system strategically prioritizes high-impact code paths over exhaustive coverage, balancing practical usability with detection effectiveness. Cross-file dependency bugs show particularly strong improvements (32.2%), suggesting the graph-based context retrieval effectively captures complex code relationships.",
    490   "red_flags": [
    491     {
    492       "flag": "Unspecified LLM model and training cutoff",
    493       "detail": "Paper refers to 'cloud-based LLMs' without naming specific models, versions, or training cutoff dates. This prevents reproducibility, contamination assessment, and independent validation of claims."
    494     },
    495     {
    496       "flag": "No competing interests disclosure despite GitHub connection",
    497       "detail": "Authors acknowledge prior work 're-licensed and assimilated into GitHub Copilot' (commercial product) but include no competing interests or financial disclosure statement."
    498     },
    499     {
    500       "flag": "Baseline poorly characterized",
    501       "detail": "Baseline described as 'model without RAG' but lacks detail on what components differ; insufficient to isolate RAG's contribution from other potential factors."
    502     },
    503     {
    504       "flag": "No statistical significance testing",
    505       "detail": "All results reported as point estimates (85.3%, 31.2%, etc.) with no confidence intervals, error bars, p-values, or evidence of statistical significance."
    506     },
    507     {
    508       "flag": "Small, unrandomized user study without ethics approval",
    509       "detail": "n=12 iOS developers, no randomization described, no IRB/ethics approval mentioned, demographics and attrition not reported. User study lacks rigor for methodological evaluation."
    510     },
    511     {
    512       "flag": "Code and experimental data not released",
    513       "detail": "Neither Copilot for Testing code nor custom adapted datasets made available; evaluation not reproducible. Prior work (Copilot for Xcode) was open-sourced but this tool is not."
    514     },
    515     {
    516       "flag": "No actual prompts provided",
    517       "detail": "Section 4.3 describes prompt structure (System Prompt, Message History, Config) but provides no actual prompt templates, examples, or exact hyperparameter values (temperature, top-p, etc.)."
    518     },
    519     {
    520       "flag": "Limited competitive baselines",
    521       "detail": "No comparison to other AI-assisted testing tools, bug detection systems, or state-of-the-art methods; only ablation (RAG vs. no RAG) provided."
    522     },
    523     {
    524       "flag": "Alternative explanations not explored",
    525       "detail": "Paper attributes improvements to 'dynamic adaptation' and 'contextual insights' but doesn't investigate whether gains come from additional context volume, specific graph topology, embedding quality, or confounds."
    526     },
    527     {
    528       "flag": "Failure cases and negative patterns not analyzed",
    529       "detail": "No systematic analysis of when system fails, false negatives, or edge cases; only brief mentions of slow response during bulk operations."
    530     },
    531     {
    532       "flag": "Generalization claims not validated",
    533       "detail": "Paper claims platform-agnostic framework but only implements/tests on Xcode + Swift/C++. Tested within a single platform ecosystem (iOS), so broad generalization claims are unsupported."
    534     },
    535     {
    536       "flag": "Missing data pipeline documentation",
    537       "detail": "No detailed specifications for code-to-embedding conversion, graph construction algorithm, or information propagation mechanics; insufficient for implementation or validation."
    538     }
    539   ],
    540   "cited_papers": [
    541     {
    542       "title": "A multi-year grey literature review on AI-assisted test automation",
    543       "authors": "Ricca et al.",
    544       "year": 2024,
    545       "relevance": "Directly related context on state of AI-assisted testing research and industrial practices"
    546     },
    547     {
    548       "title": "Evaluating large language models trained on code (Codex/HumanEval)",
    549       "authors": "Chen et al.",
    550       "year": 2021,
    551       "relevance": "Foundational work on LLM code generation capabilities and evaluation methodology"
    552     },
    553     {
    554       "title": "Software testing research challenges: An industrial perspective",
    555       "authors": "Alshahwan et al.",
    556       "year": 2023,
    557       "relevance": "Context for real-world testing challenges and priorities that motivate this work"
    558     },
    559     {
    560       "title": "Retrieval Augmented Generation for knowledge-intensive NLP tasks",
    561       "authors": "Lewis et al.",
    562       "year": 2020,
    563       "relevance": "Foundational RAG paper; core technique adapted for code context in this work"
    564     },
    565     {
    566       "title": "Copilot for Xcode: exploring AI-assisted programming by prompting cloud-based large language models",
    567       "authors": "Tan et al.",
    568       "year": 2023,
    569       "relevance": "Prior work by same authors; extends code completion tool to testing domain"
    570     },
    571     {
    572       "title": "Search-Based Software Engineering",
    573       "authors": "Harman & Jones",
    574       "year": 2001,
    575       "relevance": "Foundational SBSE framework used to formulate optimization problem in this paper"
    576     },
    577     {
    578       "title": "Prompting is programming: A query language for large language models",
    579       "authors": "Beurer-Kellner et al.",
    580       "year": 2023,
    581       "relevance": "Relevant to LLM prompt design and optimization for code generation tasks"
    582     },
    583     {
    584       "title": "Defect prediction guided search-based software testing",
    585       "authors": "Perera et al.",
    586       "year": 2020,
    587       "relevance": "Prior work combining ML-guided testing with search-based approaches; related methodology"
    588     },
    589     {
    590       "title": "An empirical evaluation of GitHub Copilot's code suggestions",
    591       "authors": "Nguyen & Nadi",
    592       "year": 2022,
    593       "relevance": "Empirical evaluation of LLM code generation quality, directly relevant to evaluation approach"
    594     },
    595     {
    596       "title": "Aligning crowd-sourced human feedback for reinforcement learning on code generation",
    597       "authors": "Wong & Tan",
    598       "year": 2024,
    599       "relevance": "Reinforcement learning from human feedback on code generation; related to user study methodology"
    600     }
    601   ],
    602   "engagement_factors": {
    603     "practical_relevance": {
    604       "score": 2,
    605       "justification": "Describes a developer tool (Xcode IDE plugin for automated testing), and the base Copilot for Xcode has a public GitHub repo, but the testing extension's availability is unclear."
    606     },
    607     "surprise_contrarian": {
    608       "score": 0,
    609       "justification": "Confirms the expected finding that adding RAG context improves LLM performance — no challenge to conventional wisdom."
    610     },
    611     "fear_safety": {
    612       "score": 0,
    613       "justification": "No AI safety, security, or risk concerns are raised by this work."
    614     },
    615     "drama_conflict": {
    616       "score": 0,
    617       "justification": "No controversy, no challenge to established benchmarks or claims."
    618     },
    619     "demo_ability": {
    620       "score": 1,
    621       "justification": "The base Copilot for Xcode is on GitHub, but the testing module described in this paper is not clearly available for others to try."
    622     },
    623     "brand_recognition": {
    624       "score": 1,
    625       "justification": "Authors are from NTU Singapore and CityU Hong Kong — known institutions but not famous AI labs. The tool echoes 'GitHub Copilot' branding."
    626     }
    627   },
    628   "hn_data": {
    629     "threads": [
    630       {
    631         "hn_id": "44502527",
    632         "title": "Dynamical origin of Theia, the last giant impactor on Earth",
    633         "points": 96,
    634         "comments": 46,
    635         "url": "https://news.ycombinator.com/item?id=44502527"
    636       },
    637       {
    638         "hn_id": "44253021",
    639         "title": "SmartAttack: Air-Gap Attack via Smartwatches",
    640         "points": 18,
    641         "comments": 6,
    642         "url": "https://news.ycombinator.com/item?id=44253021"
    643       },
    644       {
    645         "hn_id": "44494491",
    646         "title": "AsyncFlow: An Asynchronous Streaming RL Framework for LLM Post-Training",
    647         "points": 4,
    648         "comments": 0,
    649         "url": "https://news.ycombinator.com/item?id=44494491"
    650       },
    651       {
    652         "hn_id": "31607482",
    653         "title": "Understanding the Use of Centralized Exchanges for Decentralized Cryptocurrency",
    654         "points": 3,
    655         "comments": 0,
    656         "url": "https://news.ycombinator.com/item?id=31607482"
    657       },
    658       {
    659         "hn_id": "44366937",
    660         "title": "SmartAttack: Air-Gap Attack via Smartwatches",
    661         "points": 2,
    662         "comments": 0,
    663         "url": "https://news.ycombinator.com/item?id=44366937"
    664       },
    665       {
    666         "hn_id": "44254732",
    667         "title": "SmartAttack: Air-Gap Attack via Smartwatches",
    668         "points": 2,
    669         "comments": 0,
    670         "url": "https://news.ycombinator.com/item?id=44254732"
    671       },
    672       {
    673         "hn_id": "43263088",
    674         "title": "Convolutional Multi-Hybrid Language Models",
    675         "points": 2,
    676         "comments": 0,
    677         "url": "https://news.ycombinator.com/item?id=43263088"
    678       },
    679       {
    680         "hn_id": "44667582",
    681         "title": "Frugal Machine Learning for Energy-Efficient, and Resource-Aware AI",
    682         "points": 1,
    683         "comments": 0,
    684         "url": "https://news.ycombinator.com/item?id=44667582"
    685       },
    686       {
    687         "hn_id": "44459390",
    688         "title": "LoRA Fine-Tuning Without GPUs",
    689         "points": 1,
    690         "comments": 0,
    691         "url": "https://news.ycombinator.com/item?id=44459390"
    692       },
    693       {
    694         "hn_id": "43924294",
    695         "title": "Quantum Energy Teleportation Across Multi-Qubit Systems",
    696         "points": 1,
    697         "comments": 0,
    698         "url": "https://news.ycombinator.com/item?id=43924294"
    699       }
    700     ],
    701     "top_points": 96,
    702     "total_points": 130,
    703     "total_comments": 52
    704   }
    705 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs