scan.json (19950B)
1 { 2 "paper": { 3 "title": "ClarifyGPT: Empowering LLM-based Code Generation with Intention Clarification", 4 "authors": ["Fangwen Mu", "Lin Shi", "Song Wang", "Zhuohao Yu", "Binquan Zhang", "ChenXue Wang", "Shichao Liu", "Qing Wang"], 5 "year": 2023, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2310.10996" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub repository provided: https://github.com/ClarifyGPT/ClarifyGPT (reference [1])." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available benchmarks (HumanEval, MBPP-sanitized, HumanEval-ET, MBPP-ET) and states 'publicly accessible dataset and source code' in contributions." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section found. Only API model names are mentioned." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions found in the paper. The GitHub link is provided but the paper itself lacks a reproducing results section." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results in Tables 2-4 report only point estimates (e.g., 80.80%) with no confidence intervals or error bars, despite running experiments three times." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims ClarifyGPT outperforms baselines but provides no statistical significance tests. Comparisons are based solely on point estimate differences." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Relative improvements are reported with baseline context, e.g., 'improves the average performance of GPT-4 across four benchmarks from 68.02% to 75.75%' (Section 5.2)." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification for why 10 participants were recruited for the human evaluation, nor why 3 runs were chosen for averaging." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper states 'we run each approach three times and report the average results' (Section 4.4) but does not report standard deviation or any spread measure across runs." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Three baselines compared: Default LLM, Chain-of-Thought (CoT), and GPT-Engineer (Section 4.5)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "CoT (2022) and GPT-Engineer (2023) are contemporary baselines. GPT-Engineer is the most directly related work." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "RQ3 varies the number of demonstrations (zero to three-shot), functioning as an ablation of the prompt design component (Table 4)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "Only Pass@1 is used as the evaluation metric. No other metrics (e.g., Pass@k for k>1, code quality measures) are reported." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "RQ1 involves 10 human participants answering clarifying questions and evaluating code generation on MBPP-sanitized and MBPP-ET (Section 5.1)." 
84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Evaluation uses standard benchmark test suites (HumanEval, MBPP-sanitized, etc.) which are separate from the demonstration examples selected from the first three problems." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down per benchmark (HumanEval, HumanEval-ET, MBPP-sanitized, MBPP-ET) in Tables 2-4." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 6.1 case study shows both success and failure cases. Section 6.2 Limitations discusses where ClarifyGPT fails (complex inputs, code without return values)." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The zero-shot setting shows marginal improvement (0.0-2.8%), reported honestly in Table 4. The paper also notes ClarifyGPT (Simulated Feedback) underperforms Human Feedback." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims (GPT-4 from 70.96% to 80.80% on MBPP-sanitized; average improvements of 11.52% and 15.07%) match Tables 2 and 3." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims ('ClarifyGPT improves code generation') are supported by controlled comparisons where ClarifyGPT is the only variable changed against Default, CoT, and GPT-Engineer baselines." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The abstract claims ClarifyGPT 'can effectively facilitate the practical application of LLMs in real-world development environments' but tests only on Python function-level benchmarks with short requirements. Section 6.2 acknowledges limitations but the abstract/conclusion overgeneralize." 
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The threats to validity section (6.3) discusses data leakage and simulation fidelity but does not consider alternative explanations for the observed improvements (e.g., additional tokens/context providing more information regardless of clarification quality)." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section 4.2 specifies 'gpt-3.5-turbo' for ChatGPT and 'gpt-4-turbo' for GPT-4, though no snapshot dates are given. These are API model names, not versioned snapshots." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Figure 3 shows the actual prompt templates used for seed input initialization, question generation, user simulation, and enhanced code generation, including instruction text and demonstration structure." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 4.6 reports top_p=0.95, frequency_penalty=0, max_tokens=800/300, temperature=0 (or 0.8 for sampling)." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The four-stage pipeline (test input generation, code consistency check, reasoning-based question generation, enhanced code generation) is described in detail in Section 3 with Figure 1." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4.6 describes prompt construction (selecting first three problems as demonstration seeds). Section 4.3 describes benchmark statistics. The process from benchmark to evaluation is documented." 
153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 6.2 'Benefits and Limitations' and Section 6.3 'Threats to Validity' provide substantive discussion." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 6.3 discusses specific threats: data leakage from training on public benchmarks, simulation fidelity concerns, and generalizability across only two LLMs and four datasets." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 6.2 Limitations explicitly states ClarifyGPT is not suitable for code with complex inputs (images, files) or code without return values, and requires instruction-tuned LLMs." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "Human evaluation responses from the 10 participants are not made available. Only aggregate Pass@1 results are reported." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 5.1 describes the human evaluation data collection: 140 ambiguous problems identified, questionnaires with three elements, each problem assessed by three participants." 182 }, 183 "recruitment_methods_described": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 5.1 describes participants: 'ten participants, including three Ph.D. students, two Master's students, two senior researchers, and three industry developers' with Python experience details." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline from requirement input through ambiguity detection (140 of 427 identified as ambiguous), question generation (avg 2.85 questions per problem), human response collection, to code generation is documented." 
192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section found in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed, including one author from Huawei Central Software Institute." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information is disclosed, so independence cannot be assessed. One author is from Huawei, which has commercial interest in code generation tools." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement found in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper does not state training data cutoff dates for ChatGPT (gpt-3.5-turbo) or GPT-4 (gpt-4-turbo), despite evaluating on public benchmarks." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Section 6.3 first threat discusses data leakage: 'Since these LLMs are trained on open-source code repositories, it is possible that some public benchmarks were included in their training data.'" 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "While Section 6.3 acknowledges the risk, the mitigation is weak — stating that benchmarks are 'manually crafted' and 'widely employed' does not address whether they appeared in training data. HumanEval (2021) and MBPP (2021) predate GPT-4's training." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": true, 236 "answer": false, 237 "justification": "No pre-registration mentioned for the human evaluation study." 
238 }, 239 "irb_or_ethics_approval": { 240 "applies": true, 241 "answer": false, 242 "justification": "No IRB or ethics board approval mentioned despite involving 10 human participants." 243 }, 244 "demographics_reported": { 245 "applies": true, 246 "answer": true, 247 "justification": "Section 5.1 reports participant roles (PhD students, Master's students, senior researchers, industry developers) and Python experience levels (at least 3 years, 6 with 5+ years)." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": true, 251 "answer": false, 252 "justification": "No explicit inclusion/exclusion criteria stated. Participants 'have at least three years of experience in Python development' is mentioned but it is unclear if this was a selection criterion." 253 }, 254 "randomization_described": { 255 "applies": true, 256 "answer": false, 257 "justification": "The assignment of 42 problems per participant is mentioned but no randomization procedure is described for problem assignment." 258 }, 259 "blinding_described": { 260 "applies": true, 261 "answer": false, 262 "justification": "No blinding described. Participants answering clarifying questions likely knew they were evaluating ClarifyGPT." 263 }, 264 "attrition_reported": { 265 "applies": true, 266 "answer": false, 267 "justification": "No mention of whether all 10 participants completed all assigned problems or if there was any dropout." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No API costs, token counts, or latency reported despite ClarifyGPT requiring multiple LLM calls per problem (sampling n solutions, generating questions, simulating answers, final generation)." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total API spend or computational budget reported." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "ClarifyGPT elevates GPT-4 Pass@1 on MBPP-sanitized from 70.96% to 80.80% with human feedback.", 286 "evidence": "Table 2 in Section 5.1 shows this result from human evaluation with 10 participants.", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "ClarifyGPT (Simulated Feedback) improves GPT-4 average Pass@1 across four benchmarks from 68.02% to 75.75%.", 291 "evidence": "Table 3 in Section 5.2 shows results across HumanEval, HumanEval-ET, MBPP-sanitized, MBPP-ET.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "ClarifyGPT (Simulated Feedback) improves ChatGPT average Pass@1 from 58.55% to 67.22%.", 296 "evidence": "Table 3 in Section 5.2.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "ClarifyGPT demonstrates robustness to the number of demonstrations, consistently outperforming Default from zero-shot to three-shot.", 301 "evidence": "Table 4 in Section 5.3 shows consistent improvements, though zero-shot gains are marginal (0.0-2.8%).", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "The user simulation method produces high-fidelity simulated feedback close to real user responses.", 306 "evidence": "Comparison of ClarifyGPT (Human Feedback) vs (Simulated Feedback) in Table 3 shows similar but not identical results. Only tested on MBPP-sanitized/MBPP-ET overlap.", 307 "supported": "weak" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval", "case-study"], 311 "key_findings": "ClarifyGPT improves LLM code generation by detecting ambiguous requirements via code consistency checking and generating targeted clarifying questions. With human feedback, it improves GPT-4 Pass@1 by 13.87% on MBPP-sanitized. With simulated feedback, it achieves average improvements of 11.52% (GPT-4) and 15.07% (ChatGPT) across four benchmarks. 
The framework outperforms GPT-Engineer, which asks questions indiscriminately for all requirements.", 312 "red_flags": [ 313 { 314 "flag": "No statistical tests or variance reporting", 315 "detail": "Despite running experiments three times, no standard deviations, confidence intervals, or significance tests are reported. It is impossible to assess whether observed differences are statistically meaningful." 316 }, 317 { 318 "flag": "Simulated feedback uses ground-truth test cases", 319 "detail": "The user simulation method provides ground-truth test cases to the LLM to generate simulated answers. This gives the simulation access to information real users would not have, potentially inflating performance in the automated evaluation." 320 }, 321 { 322 "flag": "No inference cost analysis", 323 "detail": "ClarifyGPT requires multiple LLM calls per problem (sampling n code solutions, generating test inputs, generating questions, simulating answers, final generation) but reports no cost or latency comparison with baselines." 324 }, 325 { 326 "flag": "Small human evaluation sample", 327 "detail": "Only 10 participants with no power analysis, no IRB approval, and no randomization or blinding described. Results may not generalize." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Evaluating Large Language Models Trained on Code", 333 "authors": ["Mark Chen"], 334 "year": 2021, 335 "arxiv_id": "2107.03374", 336 "relevance": "Introduces HumanEval benchmark and Codex, foundational for LLM code generation evaluation." 337 }, 338 { 339 "title": "Self-collaboration Code Generation via ChatGPT", 340 "authors": ["Yihong Dong", "Xue Jiang", "Zhi Jin", "Ge Li"], 341 "year": 2023, 342 "arxiv_id": "2304.07590", 343 "relevance": "Multi-agent collaboration approach for LLM code generation." 344 }, 345 { 346 "title": "Is Your Code Generated by ChatGPT Really Correct? 
Rigorous Evaluation of Large Language Models for Code Generation", 347 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 348 "year": 2023, 349 "arxiv_id": "2305.01210", 350 "relevance": "Introduces the EvalPlus framework (HumanEval+ extended test suite) and the type-aware input mutation that ClarifyGPT adopts for test-input generation; the HumanEval-ET and MBPP-ET suites used in the evaluation originate from separate work (CodeScore)." 351 }, 352 { 353 "title": "CodeT: Code Generation with Generated Tests", 354 "authors": ["Bei Chen"], 355 "year": 2022, 356 "arxiv_id": "2207.10397", 357 "relevance": "Uses generated tests to improve code generation quality, related post-processing approach." 358 }, 359 { 360 "title": "CODAMOSA: Escaping coverage plateaus in test generation with pre-trained large language models", 361 "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K Lahiri", "Siddhartha Sen"], 362 "year": 2023, 363 "relevance": "LLM-augmented test generation for software engineering." 364 }, 365 { 366 "title": "Python Code Generation by Asking Clarification Questions", 367 "authors": ["Haau-Sing Li", "Mohsen Mesgar", "André F. T. Martins", "Iryna Gurevych"], 368 "year": 2023, 369 "relevance": "Most directly related prior work on clarification questions for code generation." 370 }, 371 { 372 "title": "Interactive Code Generation via Test-Driven User-Intent Formalization", 373 "authors": ["Shuvendu K. Lahiri"], 374 "year": 2022, 375 "arxiv_id": "2208.05950", 376 "relevance": "Interactive code generation approach using test-driven intent formalization." 377 }, 378 { 379 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 380 "authors": ["Jason Wei"], 381 "year": 2022, 382 "relevance": "Foundational prompting technique used as both inspiration and baseline in this work." 383 }, 384 { 385 "title": "Program Synthesis with Large Language Models", 386 "authors": ["Jacob Austin"], 387 "year": 2021, 388 "arxiv_id": "2108.07732", 389 "relevance": "Introduces MBPP benchmark used in evaluation." 390 } 391 ] 392 }