scan.json (20776B)
1 { 2 "paper": { 3 "title": "CoPrompter: User-Centric Evaluation of LLM Instruction Alignment for Improved Prompt Engineering", 4 "authors": ["Ishika Joshi", "Simra Shahid", "Shreeya Venneti", "Manushree Vasu", "Yantao Zheng", "Yunyao Li", "Balaji Krishnamurthy", "Gromit Yeuk-Yin Chan"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2411.06099" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL or code archive is provided in the paper. The system is described but no link to source code is given." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No dataset, survey responses, or study data are released. The formative study and user evaluation data are not made available." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions React frontend and FastAPI backend, and GPT-4o for evaluation, but no requirements.txt, Dockerfile, or detailed dependency specifications are provided." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided. The system design is described at a high level but there are no instructions for replicating the tool or experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "SUS scores are reported as a mean (81.25) and range but no confidence intervals or error bars are provided." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "No statistical significance tests are used. Claims about CoPrompter improving alignment are based on qualitative observations and SUS scores without any formal tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "No effect sizes are reported. Improvements are described qualitatively (e.g., participants 'saw improved alignment scores') without quantified magnitudes." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The user evaluation has N=8. The paper states the small sample is due to the requirement for experienced prompt engineers and 75-90 minute sessions, but no formal justification or power analysis is provided." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "SUS scores are reported as a mean and range (Figure 9) but no standard deviation or variance is reported." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": false, 63 "justification": "No formal baseline comparison is included. The comparison to 'traditional methods' is based on participant interviews about their past experience, not a controlled comparison." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "Related tools like ChainForge, SPADE, EvalLM, and EvalGen are discussed in related work but not used as experimental baselines." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "CoPrompter has multiple components (criteria generation, update module, alignment report card) but no ablation study examines which components contribute most." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The evaluation uses SUS scores (usability and efficiency subscales) and qualitative thematic analysis of user sessions." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "The core evaluation is a user study with 8 industry prompt engineers who used CoPrompter and provided qualitative feedback and SUS ratings (Section 6-7)." 
84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a user study of a tool, not a benchmark evaluation. No held-out test set is applicable." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "SUS scores are broken down into System Usability and System Efficiency categories (Figure 8), and qualitative findings are organized by research questions." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 7.3 discusses pain points and challenges, including instances where alignment scores did not match participant expectations, LLM-as-judge trust issues, and latency concerns." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that P1 found the system complex, P2 found priority tags arbitrarily defined, P8 preferred CLI over UI, and participants identified cases where CoPrompter's scores didn't match their judgment." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims CoPrompter 'improves the ability to identify and refine instruction alignment' and 'helps in clarifying their own requirements.' These are supported by qualitative findings in Section 7, though the evidence is qualitative, not quantitative." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper claims CoPrompter 'improves' alignment identification and helps users refine prompts, but the study design (no control group, no randomization, qualitative comparison to recalled past experience) is insufficient for causal inference." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title says 'User-Centric Evaluation' broadly but the study is limited to 8 participants from a single IT company (Adobe) doing content generation tasks. The paper does not explicitly bound generalization to this narrow setting." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No alternative explanations are discussed. Positive results could be due to novelty effect, demand characteristics, or the structured task format rather than the tool itself." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper states 'GPT-4o with temperature 0' for LLM_CGen and LLM_Eval (Section 5.2) but does not specify the exact model version/snapshot date." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper mentions prompts are 'detailed in Appendix 10.5' but the appendix text provided does not include the actual full prompt text — only references to it. The prompts for criteria generation, decomposition, and evaluation are described but not shown in full." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Temperature 0 is specified for LLM_CGen and LLM_Eval in Section 5.2." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The system's pipeline is described in detail across Sections 5.1 and 5.2: criteria generation, decomposition into atomic instructions, update module, response generation, and alignment report card module with workflow diagrams (Figures 2, 3)." 
148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "The pipeline from guidelines to atomic instructions to criteria questions to evaluation is documented step by step in Section 5.2." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 8.3 'Future Work and Limitations' discusses limitations including focus study format, small sample size, and latency issues." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "The limitations are somewhat specific (mentioning sample size and focus study format) but do not address key validity threats like lack of control group, single-company recruitment, demand characteristics, or novelty effects." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges small sample size but does not bound claims to the specific participant pool, company, or task type tested." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw data (survey responses, session recordings, transcripts, thematic codes) are made available." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "The formative study survey distribution (Slack channels and email threads) and user evaluation procedure (Microsoft Teams sessions, 75-90 minutes, recorded) are described in Sections 3 and 6." 182 }, 183 "recruitment_methods_described": { 184 "applies": true, 185 "answer": true, 186 "justification": "Formative study: survey circulated across Slack channels and email threads of prompt engineers at an IT company. 
User evaluation: snowball sampling from an IT company, with criteria focused on experience with long-form prompts (Sections 3, 6.1)." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 6.3 describes the analysis pipeline: recordings transcribed, thematic analysis approach, codes collected and categorized over three iterations using Miro." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section is present in the paper. Authors are from Adobe but no funding disclosure is made." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations with Adobe (and internship arrangements) are clearly listed on the first page." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "Authors are from Adobe. The tool is designed for prompt engineers, likely relevant to Adobe's products. No statement about funder independence is made, and Adobe has a potential interest in demonstrating prompt engineering tool value." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "This paper evaluates a tool/workflow, not a pre-trained model's capability on a benchmark." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "No benchmark evaluation of model capability is performed." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "No benchmark evaluation of model capability is performed." 
231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": true, 236 "answer": false, 237 "justification": "No pre-registration is mentioned for either the formative study or user evaluation." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": true, 241 "answer": false, 242 "justification": "No IRB or ethics board approval is mentioned despite collecting data from human participants in both studies." 243 }, 244 "demographics_reported": { 245 "applies": true, 246 "answer": true, 247 "justification": "For the user evaluation: 3 women and 5 men, 4 with product engineering experience and 4 with industrial research applications (Section 6.1). Formative study: 28 industry prompt engineers with experience levels described." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": true, 251 "answer": true, 252 "justification": "Section 6.1 states criteria: 'individuals experienced in crafting long-form prompts for models from OpenAI, Meta, and others, across various use cases.'" 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "This is not an experimental study with treatment/control conditions. All participants used CoPrompter." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "Not an experimental study with conditions requiring blinding." 263 }, 264 "attrition_reported": { 265 "applies": true, 266 "answer": false, 267 "justification": "No mention of whether any participants dropped out or whether all 8 completed the full study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "The paper mentions API call latency as a challenge but does not report actual API costs, tokens consumed, or wall-clock time for the pipeline." 
275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No computational budget or total API spend is reported." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "CoPrompter improves prompt engineers' ability to identify and refine instruction alignment over traditional methods.", 286 "evidence": "Qualitative feedback from 8 participants in user evaluation (Section 7). Participants described traditional methods as 'trial and error' and stated CoPrompter provided structured misalignment identification. No quantitative comparison.", 287 "supported": "weak" 288 }, 289 { 290 "claim": "CoPrompter achieves a mean SUS score of 81.25, placing it in the 'A' range of excellence.", 291 "evidence": "SUS survey completed by 8 participants, mean score 81.25 reported in Section 7.4 and Figure 9, compared against industry average of 68.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Prompt engineers typically require over 10 iterations and evaluation of 40+ responses to optimize complex prompts.", 296 "evidence": "Formative survey of 28 industry prompt engineers (Section 3). 14/28 reported 10+ attempts, 14/28 reported testing 40+ responses.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "CoPrompter helps users clarify their own requirements through criteria decomposition.", 301 "evidence": "Qualitative observations from user study: participants discovered missing requirements and refined criteria during the evaluation process (Section 7.1.1). P4 quote about criteria generation being helpful.", 302 "supported": "moderate" 303 } 304 ], 305 "methodology_tags": ["qualitative", "case-study"], 306 "key_findings": "CoPrompter is a user-in-the-loop tool that decomposes prompt guidelines into atomic criteria questions and generates alignment reports for LLM responses. A formative study with 28 prompt engineers revealed that complex prompts require 10+ iterations and manual inspection of 40+ responses. 
A user evaluation with 8 industry prompt engineers showed qualitative improvements in misalignment identification and requirement clarity, with a mean SUS score of 81.25. The study is primarily qualitative with no controlled comparison to existing tools.", 307 "red_flags": [ 308 { 309 "flag": "Very small sample size for claims made", 310 "detail": "The user evaluation has only 8 participants, all from a single company (Adobe), yet the paper makes broad claims about CoPrompter's effectiveness for prompt engineers generally." 311 }, 312 { 313 "flag": "No controlled comparison", 314 "detail": "CoPrompter is compared to 'traditional methods' only through participant recall of past experiences, not through a controlled study. Related tools (ChainForge, SPADE, EvalLM, EvalGen) are discussed but never compared experimentally." 315 }, 316 { 317 "flag": "Potential demand characteristics", 318 "detail": "Participants were shown a demo by the research team, then asked to use the tool and provide feedback. The social desirability bias in this setting (especially with Adobe employees evaluating an Adobe research tool) is not addressed." 319 }, 320 { 321 "flag": "Company evaluating its own tool", 322 "detail": "Adobe researchers built CoPrompter and recruited Adobe employees to evaluate it. This conflict of interest is not acknowledged in the paper." 323 } 324 ], 325 "cited_papers": [ 326 { 327 "title": "ChainForge: An open-source visual programming environment for prompt engineering", 328 "authors": ["Ian Arawjo", "Priyan Vaithilingam", "Martin Wattenberg", "Elena Glassman"], 329 "year": 2023, 330 "relevance": "Open-source tool for prompt engineering evaluation, directly related to LLM prompt development workflows." 331 }, 332 { 333 "title": "Who Validates the Validators? 
Aligning LLM-Assisted Evaluation of LLM Outputs with Human Preferences", 334 "authors": ["Shreya Shankar", "JD Zamfirescu-Pereira", "Björn Hartmann", "Aditya G Parameswaran", "Ian Arawjo"], 335 "year": 2024, 336 "arxiv_id": "2404.12272", 337 "relevance": "Framework for human-in-the-loop LLM evaluation that CoPrompter builds upon." 338 }, 339 { 340 "title": "EvalLM: Interactive evaluation of large language model prompts on user-defined criteria", 341 "authors": ["Tae Soo Kim", "Yoonjoo Lee", "Jamin Shin", "Young-Ho Kim", "Juho Kim"], 342 "year": 2024, 343 "relevance": "Interactive LLM evaluation tool using user-defined criteria, closely related to CoPrompter's approach." 344 }, 345 { 346 "title": "SPADE: Synthesizing assertions for large language model pipelines", 347 "authors": ["Shreya Shankar", "Haotian Li", "Parth Asawa"], 348 "year": 2024, 349 "arxiv_id": "2401.03038", 350 "relevance": "Tool for automatically generating evaluation criteria for LLM pipelines." 351 }, 352 { 353 "title": "Why Johnny Can't Prompt: How Non-AI Experts Try (and Fail) to Design LLM Prompts", 354 "authors": ["J.D. Zamfirescu-Pereira", "Richmond Y. Wong", "Bjoern Hartmann", "Qian Yang"], 355 "year": 2023, 356 "relevance": "User study on prompt engineering challenges, directly relevant to understanding prompt engineering difficulties." 357 }, 358 { 359 "title": "Interactive AI Alignment: Specification, Process, and Evaluation Alignment", 360 "authors": ["Michael Terry", "Chinmay Kulkarni", "Martin Wattenberg", "Lucas Dixon", "Meredith Ringel Morris"], 361 "year": 2023, 362 "relevance": "Foundational framework for human-AI alignment that CoPrompter's alignment definition builds upon." 
363 }, 364 { 365 "title": "Towards Bidirectional Human-AI Alignment: A Systematic Review for Clarifications, Framework, and Future Directions", 366 "authors": ["Hua Shen"], 367 "year": 2024, 368 "arxiv_id": "2406.09264", 369 "relevance": "Systematic review of human-AI alignment concepts relevant to the survey's scope on AI alignment methodology." 370 }, 371 { 372 "title": "FollowBench: A multi-level fine-grained constraints following benchmark for large language models", 373 "authors": ["Yuxin Jiang"], 374 "year": 2023, 375 "arxiv_id": "2310.20410", 376 "relevance": "Benchmark for evaluating LLM instruction-following capability." 377 }, 378 { 379 "title": "InFoBench: Evaluating Instruction Following Ability in Large Language Models", 380 "authors": ["Yiwei Qin"], 381 "year": 2024, 382 "arxiv_id": "2401.03601", 383 "relevance": "Benchmark for LLM instruction following evaluation." 384 }, 385 { 386 "title": "Calibrating LLM-Based Evaluator", 387 "authors": ["Yuxuan Liu"], 388 "year": 2024, 389 "relevance": "Methods for calibrating LLM-as-judge evaluators, relevant to LLM evaluation methodology." 390 } 391 ] 392 }