scan.json (31618B)
1 { 2 "paper": { 3 "title": "PromptPex: Automatic Test Generation for Language Model Prompts", 4 "authors": [ 5 "Reshabh K Sharma", 6 "Jonathan de Halleux", 7 "Shraddha Barke", 8 "Dan Grossman", 9 "Benjamin Zorn" 10 ], 11 "year": 2026, 12 "venue": "arXiv", 13 "arxiv_id": "2503.05070", 14 "doi": "10.48550/arXiv.2503.05070" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "PromptPex, an LLM-based tool for automatic test generation for prompts, consistently generates tests that cause more non-compliant model outputs than a zero-shot baseline across all four models tested (gpt-oss, gemma2:9b, qwen2.5:3b, llama3.2:1b). Inverse rules further increase non-compliance rates compared to direct rules, especially for the strongest model. Generated tests are nearly all valid per the extracted input specification, and the extracted output rules show 89% groundedness and 96.8% spec agreement with the original prompts.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The paper states 'The source code of PromptPex is available at https://anonymous.4open.science/r/prompttest-83ED' in both the abstract and Section 9 (Data Availability). An anonymous repository link is provided." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "Section 9 states 'The samples directory contains the benchmarks used in the evaluation, and the eval directory contains the evaluation results as well as all artifacts generated during the evaluation' in the anonymous repository." 
31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "Section 4.1.4 describes hardware (AMD EPYC 7V13, 220GB RAM, 4 NVIDIA A100 GPUs, Ubuntu 24.04.3 LTS) and mentions Ollama for local models, but no requirements.txt, Dockerfile, library versions, or software environment specification is provided in the paper." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper describes the pipeline conceptually (Section 3, Figure 6) but does not provide step-by-step reproduction instructions, commands to run, or a reproducibility guide." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results are reported as point estimates (percentages in Figures 9-13). No confidence intervals, error bars, or uncertainty measures are provided." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper claims PromptPex generates more non-compliant tests than baseline by comparing raw percentages across benchmarks. No statistical significance tests (t-tests, Wilcoxon, etc.) are used to validate these differences." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Figures 9-11 show the actual non-compliance percentages for both PromptPex and baseline across all models and benchmarks, providing sufficient context to assess the magnitude of differences (e.g., PromptPex vs baseline rates visible per-model in Figure 9)." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The evaluation uses 22 benchmark prompts and 4 models. No justification is given for why 22 prompts or these specific 4 models were sufficient, and no power analysis is discussed." 
63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Section 4.1.4 explicitly states 'We ran each test once per prompt per model.' Single-run results with no variance, standard deviation, or repeated trials across seeds." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "A zero-shot LLM-based test generator using the same gpt-5_2025-08-07 model serves as the baseline (Section 4.1.1). Additionally, Section 5.5 compares against a property-based testing approach using the Hypothesis framework." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "The baseline uses the same state-of-the-art model (gpt-5_2025-08-07) as PromptPex, representing a contemporary LLM-based test generation approach. The authors note this is reasonable since 'LLMs are already useful in generating unit tests for traditional software' and refined the baseline prompt through multiple revisions." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Q2 (Section 4.2) compares tests from rules vs inverse rules (Figure 10). Q3 evaluates the effect of the input specification on test validity (Figure 12). These isolate the contribution of individual PromptPex components." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Section 4.1.2 defines four metrics: % non-compliance, test validity, groundedness of output rules, and spec agreement. Results are reported for all four." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "All evaluation is automated using LLM-as-a-judge (o4-mini_2025-04-16). No human evaluation of test quality, compliance judgments, or system outputs is reported." 
95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": false, 99 "justification": "There is no separation between development and test benchmarks. All 22 benchmarks are used for both developing the approach and reporting final results. No held-out set is mentioned." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Figure 11 shows per-benchmark results for baseline, rule-based, and inverse-rule tests on gpt-oss. Figure 13 shows per-benchmark non-compliance for all four models. Detailed per-prompt breakdowns are provided." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 5 discusses multiple failure cases: the classify prompt's non-grounded rules, the art prompt's ambiguous IS/OR, Shakespeare prompt's low spec agreement, and Section 5.3 discusses limitations of inverse rules being bounded by IS." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 5.5 discusses where the symbolic property-based testing approach fails. The classify prompt is noted as an exception with low groundedness. Shakespeare is an exception for spec agreement. Section 7 discusses multiple limitations including formatting issues and self-bias." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims PromptPex 'consistently creates tests that result in more invalid model outputs than a carefully constructed baseline' — supported by Figure 9 showing higher non-compliance across all four models. The claim about 'extracting concrete specifications' is supported by the groundedness and spec agreement analysis." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper makes causal claims that specification extraction leads to better tests. 
The study design uses controlled comparisons: same model (gpt-5_2025-08-07) for both PromptPex and baseline, same benchmarks, same evaluation. The ablation comparing rules vs inverse rules (Figure 10) is a controlled single-variable manipulation." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title 'Automatic Test Generation for Language Model Prompts' is broad, and Contribution 1 claims 'Our work is the first to focus on the specific problem of automated test generation for prompts.' However, results are limited to 22 single-input prompts on 4 models. Section 4.1.3 acknowledges single-input limitation but the title and contributions generalize beyond the tested setting." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper does not discuss alternative explanations for why PromptPex outperforms baseline. For example, the multi-stage pipeline makes more LLM calls, the specification extraction provides additional context, or the structured approach inherently generates more diverse tests. Section 7 discusses limitations but not alternative explanations for the observed results." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "Section 4.1.2 explicitly defines 'We define effective tests as those that expose limitations in the prompt, which means that they result in more failures. We consider non-compliance with the prompt as the metric for test quality.' 
The paper also distinguishes compliance from correctness (Section 3.4) and acknowledges that 'our evaluation explores only a partial understanding of the model output.'" 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Section 4.1.4 specifies exact model versions: 'gpt-5_2025-08-07' and 'o4-mini_2025-04-16' (with dates), and local models with sizes: 'gemma2:9b, qwen2.5:3b, llama3.2:1b'. The API models include snapshot dates." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "The internal prompts used by PromptPex (for IS extraction, OR extraction, inverse rule generation, test generation, test evaluation) are described functionally in natural language (Sections 3.1-3.4) but the actual prompt text is not provided. E.g., 'We frame it as a task to extract the IS to create valid inputs' without giving the actual prompt. Only the POS benchmark prompt example is shown." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 4.1.4 states 'We kept the temperature 1.0 across all the requests.' Temperature is the primary hyperparameter for LLM generation." 159 }, 160 "scaffolding_described": { 161 "applies": true, 162 "answer": true, 163 "justification": "The PromptPex pipeline is described in detail in Section 3 with Figure 6 showing the end-to-end workflow: IS extraction → OR extraction → inverse rule generation → test generation → test execution on multiple models → test evaluation via LLM-as-a-judge. Each stage is explained." 
164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": false, 168 "justification": "Section 4.1.3 states 'We selected a diverse set of prompts from publicly available sources, focusing on those that are within the scope of PromptPex' but does not describe how many prompts were initially considered, the selection pipeline, or filtering criteria beyond scope constraints." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 7 is titled 'Limitations' and provides substantive discussion of multiple limitations across approximately one page." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 7 discusses study-specific threats: single-input limitation, no RAG data support, prompt injection risks affecting specification extraction, LLM-as-a-judge unreliability (citing [42]), self-bias from using same model family for generation and evaluation, and formatting/parsing issues." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 4.1.3 explicitly states: 'Currently, we support prompts that can accept only a single input' and 'We also only support prompts where the output is independent of the previous outputs, making prompts describing multi-turn conversations for tasks out of scope.' Section 7 adds that RAG-dependent prompts are unsupported." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 9 states 'The samples directory contains the benchmarks used in the evaluation, and the eval directory contains the evaluation results as well as all artifacts generated during the evaluation' in the anonymous repository." 
193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": false, 197 "justification": "Section 4.1.3 says prompts were selected from 'publicly available sources' but does not describe which sources, how they were found, or what inclusion/exclusion criteria beyond scope constraints were applied." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. The data consists of benchmark prompts from public sources." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": false, 207 "justification": "The PromptPex pipeline is documented (Figure 6), but the pipeline from raw prompt collection to final evaluation results lacks documentation. There's no description of how many initial prompts were considered, how many were excluded at each stage, or how the final 22 were determined." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding acknowledgment or grant information is provided in the paper. Three of five authors are from Microsoft Research, and the first author's work was done while at Microsoft Research, but no explicit funding disclosure exists." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly listed: University of Washington and Microsoft Research. The first author's note states 'Work done while at Microsoft Research.'" 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "Three authors are from Microsoft Research, and the tool uses Microsoft-associated models (gpt-5, o4-mini via APIs). Microsoft has a financial interest in demonstrating the value of LLM-based development tools. No discussion of funder independence." 
225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is present in the paper. Microsoft Research employees evaluating an approach built on Microsoft-associated models creates a potential undisclosed conflict." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "No training data cutoff dates are stated for any of the models used (gpt-5_2025-08-07, o4-mini_2025-04-16, gpt-oss, gemma2:9b, qwen2.5:3b, llama3.2:1b). The benchmark prompts are from public sources and could be in training data." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "The 22 benchmark prompts are from 'publicly available sources' and could have been seen during model training. No discussion of whether models have encountered these prompts or similar prompts during training." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "The benchmark prompts are publicly available and some models could have been trained on them. No contamination analysis is performed or discussed." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study. Evaluation is entirely automated using LLM-generated tests and LLM-as-a-judge." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 
269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "No API costs, tokens consumed, or wall-clock time are reported for running PromptPex or the baseline. The multi-stage pipeline presumably costs more than the single-call baseline but this is not quantified." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "Section 4.1.4 describes hardware (AMD EPYC 7V13, 220GB RAM, 4 NVIDIA A100 GPUs) but does not state total GPU hours, API spend, or wall-clock time for the evaluation." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "Section 4.1.4 explicitly states 'We ran each test once per prompt per model.' No multiple seeds, no sensitivity analysis." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": true, 307 "justification": "Section 4.1.4 explicitly states 'We ran each test once per prompt per model.' The number of runs (1) is clearly stated." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "Section 4.1.1 states the baseline prompt 'underwent multiple revisions to enhance its ability to generate effective tests' and 'We have refined both approaches sufficiently so that they can be effectively compared,' but no search budget, number of configurations, or selection methodology is described." 
313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": false, 317 "justification": "The paper does not describe how the final prompts (for IS extraction, OR extraction, test generation, etc.) were selected from alternatives. The baseline prompt revision process is mentioned but not documented." 318 }, 319 "multiple_comparison_correction": { 320 "applies": false, 321 "answer": false, 322 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors built both PromptPex and the baseline test generator. They acknowledge self-bias risk for LLM-as-a-judge (Section 7) but do not address the bias of authors evaluating their own system vs their own baseline implementation." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "PromptPex uses a multi-stage pipeline (IS extraction + OR extraction + inverse rule generation + test generation) requiring many more LLM calls than the single-call baseline. This compute cost difference is not quantified or discussed." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "The paper does not discuss whether % non-compliance (as judged by an LLM) actually measures test quality. The connection between higher non-compliance and better tests is assumed rather than validated. Section 3.4 acknowledges 'our evaluation explores only a partial understanding' but does not formally examine construct validity." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "PromptPex IS the scaffold being evaluated. The comparison is between PromptPex's pipeline vs a simpler baseline approach. The scaffold is the thing being tested, not a confound." 
343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "The benchmark prompts are from public sources that predate model training. No discussion of whether models may have seen these prompts or similar prompts during training." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the evaluation setup leaks information. For example, the test generator (gpt-5_2025-08-07) and the judge (o4-mini_2025-04-16) come from the same model family (OpenAI), creating potential information leakage." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether the 22 benchmark prompts are independent of each other or share structural similarities that could inflate results." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, or decontamination is discussed." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "PromptPex consistently generates tests that result in more non-compliant model outputs than a carefully constructed baseline LLM-based test generator across all four models tested.", 371 "evidence": "Figure 9 shows average % non-compliance across benchmarks for PromptPex vs baseline for gpt-oss, gemma2:9b, qwen2.5:3b, and llama3.2:1b. PromptPex shows higher non-compliance for every model, with the largest gap for gpt-oss (Section 4.2, Q1).", 372 "supported": "moderate" 373 }, 374 { 375 "claim": "Tests generated from inverse rules result in greater non-compliance than tests from direct rules for all models under test.", 376 "evidence": "Figure 10 compares non-compliance of tests from rules vs inverse rules. 
Inverse rules show higher non-compliance for all four models, with the best model (gpt-oss) showing the largest increase (Section 4.2, Q2).", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "Almost all PromptPex-generated tests are valid according to the extracted input specification.", 381 "evidence": "Figure 12 shows the number of valid tests vs total tests per benchmark. Nearly every generated test is deemed valid (Section 4.2, Q3).", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "PromptPex tests can differentiate model capabilities and determine model suitability for a given prompt.", 386 "evidence": "Figure 13 shows per-benchmark non-compliance across models. More capable models (gpt-oss) have lower non-compliance while smaller models (llama3.2:1b) have higher rates, with some smaller models excelling on specific prompts (Section 4.2, Q4).", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Extracted output rules achieve 89% average groundedness and 96.8% spec agreement with the original prompts.", 391 "evidence": "Section 4.2 reports 89% average rule groundedness with the classify prompt as exception, and 96.8% spec agreement (99.9% excluding the shakespeare outlier).", 392 "supported": "moderate" 393 } 394 ], 395 "red_flags": [ 396 { 397 "flag": "Single-run evaluation with no statistical testing", 398 "detail": "Each test was run exactly once per prompt per model (Section 4.1.4). With LLM non-determinism at temperature 1.0, results could vary substantially across runs. No significance tests, confidence intervals, or variance measures are reported, making it impossible to assess whether observed differences are reliable." 399 }, 400 { 401 "flag": "Self-evaluation bias", 402 "detail": "The authors built both PromptPex and the baseline test generator, and designed both evaluation metrics and the benchmark selection. The baseline prompt 'underwent multiple revisions' but the revision process is not documented. 
This creates systematic bias where the authors' system may be inadvertently favored." 403 }, 404 { 405 "flag": "LLM-as-a-judge circularity", 406 "detail": "All evaluation relies on LLM-as-a-judge (o4-mini_2025-04-16), which is from the same model family (OpenAI) as the test generator (gpt-5_2025-08-07). Section 7 acknowledges self-bias risk but does not mitigate it. The paper's core metric (non-compliance) is only as reliable as the LLM judge." 407 }, 408 { 409 "flag": "Undisclosed conflicts of interest", 410 "detail": "Three of five authors are from Microsoft Research, and the tool uses Microsoft-associated API models. No funding disclosure, no competing interests statement, and no acknowledgment that Microsoft has a financial interest in demonstrating the value of LLM-based development tools." 411 }, 412 { 413 "flag": "Unfair compute comparison with baseline", 414 "detail": "PromptPex uses a multi-stage pipeline (IS extraction + OR extraction + inverse rule generation + test generation) requiring many more LLM calls than the single-call baseline. The cost difference is never quantified or discussed, making the comparison unfair without cost normalization." 415 } 416 ], 417 "cited_papers": [ 418 { 419 "title": "Retain: Interactive tool for regression testing guided llm migration", 420 "authors": ["Tanay Dixit", "Daniel Lee", "Sally Fang", "Sai Sree Harsha", "Anirudh Sureshan", "Akash Maharaj", "Yunyao Li"], 421 "year": 2024, 422 "relevance": "Directly related work on using unit tests for model migration of prompts." 423 }, 424 { 425 "title": "An empirical evaluation of using large language models for automated unit test generation", 426 "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"], 427 "year": 2024, 428 "relevance": "Empirical evaluation of LLMs for automated unit test generation in traditional software." 
429 }, 430 { 431 "title": "SPADE: Synthesizing data quality assertions for large language model pipelines", 432 "authors": ["Shreya Shankar", "Haotian Li", "Parth Asawa"], 433 "year": 2024, 434 "relevance": "Automatically generates assertions over LLM pipeline outputs for regression checking." 435 }, 436 { 437 "title": "Pex - white box test generation for .NET", 438 "authors": ["Nikolai Tillmann", "Peli de Halleux"], 439 "year": 2008, 440 "relevance": "Foundational work on automated test generation using dynamic symbolic execution that inspired PromptPex." 441 }, 442 { 443 "title": "DSPy: Compiling declarative language model calls into self-improving pipelines", 444 "authors": ["Omar Khattab", "Arnav Singhvi", "Paridhi Maheshwari"], 445 "year": 2023, 446 "arxiv_id": "2310.03714", 447 "relevance": "Prompt optimization framework that requires equivalence checking test suites, related to prompt testing." 448 }, 449 { 450 "title": "Automatic prompt optimization with 'gradient descent' and beam search", 451 "authors": ["Reid Pryzant", "Dan Iter", "Jerry Li", "Yin Tat Lee", "Chenguang Zhu", "Michael Zeng"], 452 "year": 2023, 453 "arxiv_id": "2305.03495", 454 "relevance": "Prompt optimization technique that relies on test suites for equivalence checking." 455 }, 456 { 457 "title": "PromptFuzz: Harnessing fuzzing techniques for robust testing of prompt injection in LLMs", 458 "authors": ["Jiahao Yu", "Yangguang Shao", "Hanwen Miao", "Junzheng Shi", "Xinyu Xing"], 459 "year": 2024, 460 "relevance": "Prompt fuzzing for testing prompt injection vulnerabilities, complementary approach to PromptPex." 461 }, 462 { 463 "title": "Large language models are not fair evaluators", 464 "authors": ["Peiyi Wang", "Lei Li", "Liang Chen"], 465 "year": 2023, 466 "arxiv_id": "2305.17926", 467 "relevance": "Demonstrates LLM-as-a-judge reliability concerns directly relevant to PromptPex's evaluation methodology." 
468 }, 469 { 470 "title": "Judging LLM-as-a-judge with MT-bench and chatbot arena", 471 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 472 "year": 2023, 473 "relevance": "Foundational work on LLM-as-a-judge evaluation methodology used by PromptPex." 474 }, 475 { 476 "title": "Specifications: The missing link to making the development of LLM systems an engineering discipline", 477 "authors": ["Ion Stoica", "Matei Zaharia", "Joseph Gonzalez"], 478 "year": 2024, 479 "arxiv_id": "2412.05299", 480 "relevance": "Highlights importance of specifications in prompt engineering for reliability, directly motivating PromptPex's specification extraction approach." 481 }, 482 { 483 "title": "ChatUniTest: a ChatGPT-based automated unit test generation tool", 484 "authors": ["Zhuokui Xie", "Yinghao Chen", "Chen Zhi", "Shuiguang Deng", "Jianwei Yin"], 485 "year": 2023, 486 "arxiv_id": "2305.04764", 487 "relevance": "LLM-based automated unit test generation for traditional code, baseline comparison context for prompt testing." 488 }, 489 { 490 "title": "No more manual tests? Evaluating and improving ChatGPT for unit test generation", 491 "authors": ["Zhiqiang Yuan", "Yiling Lou", "Mingwei Liu"], 492 "year": 2023, 493 "arxiv_id": "2305.04207", 494 "relevance": "Evaluates LLM capability for unit test generation, relevant to understanding baseline test generation quality." 495 } 496 ], 497 "engagement_factors": { 498 "practical_relevance": { 499 "score": 3, 500 "justification": "PromptPex is a directly usable tool for prompt developers to test and debug their prompts across models, with source code released." 501 }, 502 "surprise_contrarian": { 503 "score": 1, 504 "justification": "The idea that prompts need testing is not surprising; the specification extraction approach is novel but does not challenge conventional wisdom." 
505 }, 506 "fear_safety": { 507 "score": 0, 508 "justification": "No AI risk or security concerns raised; the paper focuses on software testing quality." 509 }, 510 "drama_conflict": { 511 "score": 0, 512 "justification": "No controversy or conflicting claims with existing work." 513 }, 514 "demo_ability": { 515 "score": 2, 516 "justification": "Source code is available in an anonymous repository but it's not a pip-installable package or live demo." 517 }, 518 "brand_recognition": { 519 "score": 2, 520 "justification": "Microsoft Research is a well-known lab; the tool uses high-profile models (gpt-5, o4-mini) but is not about a flagship product." 521 } 522 } 523 }