scan.json (27123B)
1 { 2 "paper": { 3 "title": "ClarifyGPT: Empowering LLM-based Code Generation with Intention Clarification", 4 "authors": [ 5 "Fangwen Mu", 6 "Lin Shi", 7 "Song Wang", 8 "Zhuohao Yu", 9 "Binquan Zhang", 10 "ChenXue Wang", 11 "Shichao Liu", 12 "Qing Wang" 13 ], 14 "year": 2023, 15 "venue": "arXiv preprint (submitted to ACM)", 16 "arxiv_id": "2310.10996", 17 "doi": "10.1145/nnnnnnn.nnnnnnn" 18 }, 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "A GitHub repository is provided: https://github.com/ClarifyGPT/ClarifyGPT (Reference [1]). The paper states 'publicly accessible dataset and source code [1] to facilitate the replication of our study.'" 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper uses publicly available benchmarks (HumanEval, MBPP-sanitized, HumanEval-ET, MBPP-ET) and the GitHub repository is stated to contain the dataset. The benchmarks themselves are public." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper specifies model configuration parameters (temperature, top_p, max_tokens) but does not provide a requirements.txt, Dockerfile, or detailed environment setup section listing library versions." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions are provided in the paper. The paper references the GitHub repository but does not include a 'Reproducing Results' section or specific commands to run experiments." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "Results are reported as point estimates (e.g., 80.80%, 60.19%) across Tables 2, 3, and 4 with no confidence intervals, error bars, or ± notation despite running each approach three times." 
47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper claims ClarifyGPT 'significantly' improves performance and 'substantially' improves results but provides no statistical significance tests (no p-values, t-tests, or other tests). Comparisons are based solely on comparing percentage point differences." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "The paper reports relative improvements with baseline context throughout. For example: 'ClarifyGPT elevates the performance (Pass@1) of GPT-4 from 70.96% to 80.80%' (Table 2), with relative improvement percentages (13.87%, 16.83%). This provides enough context to assess magnitude." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The human evaluation uses only 10 participants with no justification for why this number was chosen and no power analysis. The number of benchmark problems is inherited from existing benchmarks without justification for adequacy." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "The paper states 'we run each approach three times and report the average results as the final results' (Section 4.4) but does not report standard deviation, variance, or any spread measure across the three runs." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Three baselines are included: Default LLM, Chain-of-Thought (CoT), and GPT-Engineer (Section 4.5). Comparisons are shown in Tables 2 and 3." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "The baselines are contemporary: CoT (Wei et al. 2022) and GPT-Engineer (Osika, 2023) are recent approaches. GPT-Engineer is the most directly comparable system for interactive code generation." 
79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": false, 83 "justification": "No ablation study is performed to measure the individual contribution of the main components (test input generation, code consistency check, reasoning-based question generation). RQ3 varies the number of demonstrations but does not ablate framework components." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": false, 88 "justification": "Only Pass@1 is used as the evaluation metric. No other metrics (e.g., Pass@k for k>1, code quality metrics, question quality metrics) are reported." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": true, 93 "justification": "RQ1 (Section 5.1) includes a human evaluation with 10 participants who use ClarifyGPT for code generation on MBPP-sanitized and MBPP-ET benchmarks. Participants answered clarifying questions generated by the system." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "The benchmarks use dedicated test cases for evaluation. The three demonstration examples are taken from the first three problems of each benchmark (Section 4.6), while the remaining problems serve as the evaluation set." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down per benchmark (HumanEval, HumanEval-ET, MBPP-sanitized, MBPP-ET) and per model (ChatGPT, GPT-4) in Tables 2, 3, and 4, rather than only reporting overall averages." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 6.1 (Case Study) discusses qualitative examples including cases where baselines fail. Section 6.2 (Limitations) discusses cases where ClarifyGPT is not suitable, such as code with complex input types or code that does not return output values." 
109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper reports that ClarifyGPT in the zero-shot setting (i.e., without demonstrations) shows only marginal, near-zero improvement (e.g., 0.5% on HumanEval with GPT-4, 0.0% on HumanEval-ET; Table 4). The paper also notes that simulated feedback performs slightly worse than human feedback." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims GPT-4 performance improves from 70.96% to 80.80% on MBPP-sanitized (supported by Table 2), and average improvement from 68.02% to 75.75% for GPT-4 and 58.55% to 67.22% for ChatGPT (supported by Table 3). All numerical claims match the results." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper makes causal claims such as 'ClarifyGPT can effectively facilitate the practical application of LLMs' and attributes improvements to specific components ('We attribute the improvements to our novel techniques, i.e., ambiguous requirement identification and clarifying question generation'). However, without an ablation study, the causal contribution of individual components is not established." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper claims 'ClarifyGPT can significantly facilitate the practical application of LLMs in real-world development environments' but tests only on Python function-level code generation benchmarks with only two LLMs (GPT-4 and ChatGPT). The title 'Empowering LLM-based Code Generation' is broader than what is tested."
131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The threats to validity section (Section 6.3) discusses data leakage, simulation fidelity, and generalizability but does not consider alternative explanations for the observed improvements. For instance, the improvement could partly stem from additional LLM calls (more compute) rather than the specific clarification mechanism. No such alternatives are discussed." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper states 'gpt-3.5-turbo' and 'gpt-4-turbo' (Section 4.2) but does not specify snapshot dates or exact version identifiers (e.g., gpt-3.5-turbo-0613 or gpt-4-turbo-2023-xx-xx). The models served under these API names can change behavior over time." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Figure 3 shows the detailed prompts used in ClarifyGPT for all four stages: seed input initialization, reasoning-based question generation, user simulation, and enhanced code generation. The actual prompt text, including instructions, demonstrations, and query structure, is provided." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4.6 reports: top_p=0.95, frequency_penalty=0, max_tokens=800 for question generation and 300 for others, temperature=0 for most operations and 0.8 for code sampling. Stop sequences are also specified." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The four-stage pipeline (test input generation, code consistency check, reasoning-based question generation, enhanced code generation) is described in detail in Section 3 with Figure 1 providing an overview. The workflow, prompting strategy, and mutation logic are all documented."
158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4.6 describes prompt construction: demonstration seeds are selected from the first three problems of each benchmark, and demonstrations are manually created. Section 4.3 describes the benchmarks and their characteristics. The data pipeline from requirement input to code output is documented." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 6.2 'Benefits and Limitations' provides substantive discussion of limitations, and Section 6.3 'Threats to Validity' discusses three specific threats." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 6.3 discusses specific threats: (1) data leakage from LLM training data overlapping with benchmarks, (2) user simulation fidelity, and (3) generalizability across only two LLMs and four datasets. These are specific to this study." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 6.2 explicitly states that ClarifyGPT requires instruction-tuned LLMs (not applicable to InCoder, CodeGen), is not suitable for code with complex inputs (images, files), and may have limitations for code without return values (e.g., deep learning programs)." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "While the benchmarks are public, the raw experimental outputs (generated code solutions, generated questions, participant responses from the human evaluation) are not made available for independent verification." 
187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 5.1 describes the human evaluation data collection in detail: 140 ambiguous problems identified, 2.85 average questions per problem, three questionnaires per problem, and participant assignment of 42 problems each." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 5.1 describes the 10 participants: three PhD students, two Master's students, two senior researchers, and three industry developers. All have at least 3 years of Python experience (6 with 5+ years). They were trained with example questionnaires before the study." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from requirement input through ambiguity detection, question generation, participant answers, to final code generation is documented. Section 5.1 describes the flow: 427 problems → 140 identified as ambiguous → questionnaires distributed → answers collected → code generated → evaluated." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information or acknowledgments section is present in the paper. One co-author (Shichao Liu) is from Huawei, suggesting potential industry funding, but no funding source is disclosed." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All author affiliations are listed on page 1, including Shichao Liu from 'Software IDE innovation Lab, Huawei Central Software Institute, China.' Academic affiliations at CAS and Beihang University are also listed." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "One author is from Huawei, a company that develops and sells AI/LLM-powered development tools. 
No funding disclosure is provided, so independence of funding from outcomes cannot be assessed. The paper evaluates OpenAI models rather than Huawei products, but the general area of LLM-based code generation is commercially relevant to Huawei." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial interests declaration is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper does not state the training data cutoff dates for GPT-4 or ChatGPT (gpt-3.5-turbo). This is relevant because the benchmarks (HumanEval, MBPP) were published before GPT-4's training cutoff." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": true, 235 "justification": "Section 6.3 discusses this: 'Since these LLMs are trained on open-source code repositories, it is possible that some public benchmarks were included in their training data.' The paper acknowledges the potential for overlap." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "While Section 6.3 acknowledges the contamination risk, the mitigation offered ('we carefully select HumanEval and MBPP-sanitized') is insufficient. HumanEval and MBPP were both published in 2021 and were very likely in GPT-4's training data. No actual contamination analysis (e.g., canary strings, membership inference) is performed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": true, 246 "answer": false, 247 "justification": "No pre-registration is mentioned. The human evaluation with 10 participants is not pre-registered on any platform (OSF, AsPredicted, etc.)."
248 }, 249 "irb_or_ethics_approval": { 250 "applies": true, 251 "answer": false, 252 "justification": "No IRB or ethics board approval is mentioned despite the study involving 10 human participants who answered questionnaires." 253 }, 254 "demographics_reported": { 255 "applies": true, 256 "answer": true, 257 "justification": "Section 5.1 reports demographics: three PhD students, two Master's students, two senior researchers, three industry developers. Experience levels are noted: all have 3+ years of Python experience, six with 5+ years." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": true, 261 "answer": false, 262 "justification": "The paper states participants have 'at least three years of experience in Python development' but does not describe formal inclusion/exclusion criteria or a screening process. The 10 participants were 'recruited' without description of how they were selected." 263 }, 264 "randomization_described": { 265 "applies": true, 266 "answer": false, 267 "justification": "The paper states '42 problems' were assigned to each participant and 'Each problem will be solved by three participants' but does not describe how the assignment was performed or whether it was randomized." 268 }, 269 "blinding_described": { 270 "applies": true, 271 "answer": false, 272 "justification": "No blinding is described. Participants knew they were answering clarifying questions generated by ClarifyGPT. Whether they knew which system produced which output is not discussed." 273 }, 274 "attrition_reported": { 275 "applies": true, 276 "answer": false, 277 "justification": "No attrition information is reported. The paper does not state whether all 10 participants completed all their assigned tasks or if any dropped out." 
278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No API costs, token counts, or latency measurements are reported despite ClarifyGPT requiring multiple LLM calls per problem (seed input generation, code sampling, question generation, user simulation, final code generation). The paper acknowledges GPT-Engineer 'results in an increase in token counts...consequently escalating operational expenses' but does not report ClarifyGPT's own costs." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget, API spend, or hardware information is stated. The experiments involve multiple LLM API calls across four benchmarks, two models, three runs each, but the total cost is not quantified." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "ClarifyGPT elevates GPT-4 Pass@1 on MBPP-sanitized from 70.96% to 80.80% with human feedback.", 296 "evidence": "Table 2 (Section 5.1) shows the human evaluation results across 10 participants on MBPP-sanitized benchmark.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "ClarifyGPT improves the average performance of GPT-4 across four benchmarks from 68.02% to 75.75% with simulated feedback.", 301 "evidence": "Table 3 (Section 5.2) shows automated evaluation results: HumanEval 87.80%, HumanEval-ET 78.05%, MBPP-sanitized 78.69%, MBPP-ET 58.47%, averaging 75.75%.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "ClarifyGPT improves the average performance of ChatGPT across four benchmarks from 58.55% to 67.22% with simulated feedback.", 306 "evidence": "Table 3 (Section 5.2) shows: HumanEval 74.39%, HumanEval-ET 64.84%, MBPP-sanitized 74.08%, MBPP-ET 55.58%, averaging 67.22%.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "ClarifyGPT's user simulation method produces high-fidelity results close to human feedback.", 311 "evidence": "Comparison between 
ClarifyGPT (Human Feedback) and ClarifyGPT (Simulated Feedback) in Table 3 shows simulated results are slightly lower but in a similar range (e.g., 78.69% vs 80.80% on MBPP-sanitized with GPT-4).", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "ClarifyGPT demonstrates robustness to the number of demonstrations in prompts.", 316 "evidence": "Table 4 (Section 5.3) shows consistent improvements from zero-shot to three-shot across all benchmarks and models.", 317 "supported": "moderate" 318 }, 319 { 320 "claim": "ClarifyGPT can significantly facilitate the practical application of LLMs in real-world development environments.", 321 "evidence": "Only tested on Python function-level benchmark problems (HumanEval, MBPP) with two LLMs. No real-world development environment testing was conducted.", 322 "supported": "weak" 323 } 324 ], 325 "methodology_tags": [ 326 "benchmark-eval" 327 ], 328 "key_findings": "ClarifyGPT is a framework that enhances LLM-based code generation by detecting ambiguous requirements via code consistency checking and generating targeted clarifying questions. In human evaluation with 10 participants, ClarifyGPT improved GPT-4 Pass@1 on MBPP-sanitized from 70.96% to 80.80%. Automated evaluations using a user simulation method showed average improvements of 11.52% for GPT-4 and 15.07% for ChatGPT across four Python code generation benchmarks. The framework's performance scales with the number of few-shot demonstrations, with one-shot already capturing most of the improvement.", 329 "red_flags": [ 330 { 331 "flag": "No variance or error bars despite multiple runs", 332 "detail": "The paper runs each experiment three times and reports averages but never reports standard deviation, confidence intervals, or any spread measure. Without this, the reader cannot assess whether observed differences are reliable or within noise." 
333 }, 334 { 335 "flag": "No statistical significance tests", 336 "detail": "The paper uses words like 'significantly' and 'substantially' to describe improvements but provides no statistical tests. Differences between methods could be within random variation." 337 }, 338 { 339 "flag": "No ablation study", 340 "detail": "The framework has four distinct stages but no ablation study isolates the contribution of each component. It is unclear whether the improvement comes from the consistency check, the question generation, the reasoning-based prompting, or simply from additional LLM calls and refined prompts." 341 }, 342 { 343 "flag": "User simulation provides ground truth information", 344 "detail": "The simulated user feedback method gives the LLM access to ground-truth test cases when generating responses. This creates an unfair advantage: the simulated user has perfect knowledge of the expected behavior, which real users would not have. The simulated feedback results may overestimate real-world performance." 345 }, 346 { 347 "flag": "Small human evaluation sample", 348 "detail": "Only 10 participants were used for the human evaluation, with no power analysis or justification for this number. The study has no IRB approval, no formal inclusion/exclusion criteria, and no attrition reporting." 349 }, 350 { 351 "flag": "Potential benchmark contamination unaddressed", 352 "detail": "HumanEval (2021) and MBPP (2021) were very likely in the training data of GPT-4 and ChatGPT. The paper acknowledges this risk but performs no contamination analysis. Improvements may partly reflect the model already knowing the answers and benefiting from disambiguating hints." 
353 } 354 ], 355 "cited_papers": [ 356 { 357 "title": "Evaluating Large Language Models Trained on Code", 358 "authors": ["Mark Chen", "Jerry Tworek"], 359 "year": 2021, 360 "arxiv_id": "2107.03374", 361 "relevance": "Introduces HumanEval benchmark and Codex, foundational work for evaluating LLM code generation capabilities." 362 }, 363 { 364 "title": "Program Synthesis with Large Language Models", 365 "authors": ["Jacob Austin", "Augustus Odena"], 366 "year": 2021, 367 "arxiv_id": "2108.07732", 368 "relevance": "Introduces MBPP benchmark used for evaluating code generation." 369 }, 370 { 371 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 372 "authors": ["Jason Wei", "Xuezhi Wang"], 373 "year": 2022, 374 "arxiv_id": "2201.11903", 375 "relevance": "Foundational prompting technique (CoT) used as a baseline and inspiration for ClarifyGPT's reasoning-based prompting." 376 }, 377 { 378 "title": "CodeT: Code Generation with Generated Tests", 379 "authors": ["Bei Chen", "Fengji Zhang"], 380 "year": 2022, 381 "arxiv_id": "2207.10397", 382 "relevance": "Related approach using generated tests to improve code generation via dual execution agreement." 383 }, 384 { 385 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation", 386 "authors": ["Jiawei Liu", "Chunqiu Steven Xia"], 387 "year": 2023, 388 "arxiv_id": "2305.01210", 389 "relevance": "Introduces the EvalPlus evaluation framework (HumanEval+) and the type-aware mutation strategy used by ClarifyGPT for test input generation." 390 }, 391 { 392 "title": "Self-collaboration Code Generation via ChatGPT", 393 "authors": ["Yihong Dong", "Xue Jiang"], 394 "year": 2023, 395 "arxiv_id": "2304.07590", 396 "relevance": "Related work on enhancing LLM code generation through multi-agent collaboration." 397 }, 398 { 399 "title": "Python Code Generation by Asking Clarification Questions", 400 "authors": ["Haau-Sing Li", "Mohsen Mesgar", "André F. T.
Martins", "Iryna Gurevych"], 401 "year": 2023, 402 "doi": "10.18653/v1/2023.acl-long.799", 403 "relevance": "The only prior work addressing ambiguous requirements in code generation; directly compared and contrasted with ClarifyGPT." 404 }, 405 { 406 "title": "CODAMOSA: Escaping coverage plateaus in test generation with pre-trained large language models", 407 "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K. Lahiri", "Siddhartha Sen"], 408 "year": 2023, 409 "relevance": "LLM-based test generation approach relevant to understanding automated testing with language models." 410 }, 411 { 412 "title": "GPT-4 Technical Report", 413 "authors": ["OpenAI"], 414 "year": 2023, 415 "arxiv_id": "2303.08774", 416 "relevance": "Technical report for GPT-4, one of the two primary LLMs evaluated in this study." 417 }, 418 { 419 "title": "Interactive Code Generation via Test-Driven User-Intent Formalization", 420 "authors": ["Shuvendu K. Lahiri", "Aaditya Naik"], 421 "year": 2022, 422 "arxiv_id": "2208.05950", 423 "relevance": "Related interactive code generation approach using test-driven formalization of user intent." 424 }, 425 { 426 "title": "Enabling Programming Thinking in Large Language Models Toward Code Generation", 427 "authors": ["Jia Li", "Ge Li"], 428 "year": 2023, 429 "relevance": "Structured Chain-of-Thought (SCoT) for code generation, a prompting technique compared conceptually with ClarifyGPT's approach." 430 }, 431 { 432 "title": "Competition-Level Code Generation with AlphaCode", 433 "authors": ["Yujia Li", "David H. Choi"], 434 "year": 2022, 435 "arxiv_id": "2203.07814", 436 "relevance": "Large-scale code generation system demonstrating sampling and filtering approaches to code generation." 437 } 438 ] 439 }