scan.json (25717B)
1 { 2 "paper": { 3 "title": "CodeSift: An LLM-Based Reference-Less Framework for Automatic Code Validation", 4 "authors": [ 5 "Pooja Aggarwal", 6 "Brent Paulovicks", 7 "Oishik Chatterjee", 8 "Brad Blancett", 9 "Ting Dai", 10 "Arthur De Magalhaes", 11 "Prateeti Mohapatra" 12 ], 13 "year": 2024, 14 "venue": "arXiv preprint", 15 "arxiv_id": "2408.15630" 16 }, 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The framework prompts are shown only as illustrations in Figure 1, not as a full reproducible artifact." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "The paper contributes a novel Bash dataset of 100 tasks but does not provide a download link or repository for it. HumanEval and MBPP+ are public, but the Bash dataset is the authors' own contribution and is not released." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "No requirements.txt, Dockerfile, conda environment, or library version details are provided. The paper mentions model names but not the software environment used to run experiments." 33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. A researcher would need to reverse-engineer the framework from the paper description." 38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "Tables I-III report only point estimates (e.g., '72.92' accuracy). No confidence intervals, error bars, or ± notation are provided." 
45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": false, 49 "justification": "The paper claims CodeSift 'outperforms' baselines based solely on comparing accuracy numbers (e.g., 72.92 vs 70.79) without any statistical significance test (no p-values, t-tests, or bootstrap tests)." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper reports raw accuracy differences (e.g., 'outperforms ICE-Score by an average of 8%') but no formal effect size measures like Cohen's d. The percentage differences are given without baseline context and are not expressed in any standardized form." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "The Bash dataset has 100 tasks, HumanEval has 164 problems, and MBPP+ has 399 problems. No justification is given for why these sample sizes are adequate. The user study has only 3 SMEs with 105 instances and no justification for this sample size." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. The paper does not mention whether experiments were repeated multiple times. All results appear to be single-run numbers." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "The paper compares CodeSift against two baselines: ICE-Score and Reference Grading (LLM-as-a-judge framework). Results are presented in Tables I and III." 72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": true, 76 "justification": "ICE-Score (2024) and Reference Grading from Zheng et al. (2023, NeurIPS) are recent methods. Both are contemporary prompt-based code evaluation approaches." 
77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": true, 81 "justification": "Table II presents results for individual components of CodeSift: similarity analysis alone, difference analysis alone, and the combined ensemble. This serves as an ablation study showing each component's contribution." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": true, 86 "justification": "The paper reports both accuracy (Table I) and precision (Table III). These are two different evaluation metrics." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": true, 91 "justification": "RQ2 describes a user study with 3 SMEs who evaluated 105 instances, reporting 78% agreement on code functionality and 83% agreement on validation output. This constitutes human evaluation of the system's outputs." 92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": true, 96 "justification": "The evaluation uses three separate benchmark datasets (HumanEval, MBPP+, Bash) as test sets. These are not used for tuning the framework — CodeSift does not involve training, so the benchmarks serve as held-out evaluation sets." 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Results are broken down by dataset (HumanEval, Bash, MBPP+), by code generation model (Starcoder, GPT3.5, Codellama, Mistral), and by evaluation model (Mistral, Mixtral, Llama2-Chat) in Tables I-III." 102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": true, 106 "justification": "Table IV includes specific failure examples with explanations. The paper discusses failure modes in RQ4, such as incorrect functionality generation (third entry in Table IV) and inability to detect minor discrepancies (sixth entry)." 107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The paper reports that Llama2-Chat performs notably worse as an evaluator, and discusses cases where CodeSift fails. 
The Limitations section (Section VII) discusses incorrect functionality generation and verbose model outputs causing errors." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The abstract claims CodeSift 'outperforms state-of-the-art code evaluation methods' and that 'output generated by CodeSift is in line with human preference.' Table I shows it outperforms baselines in most (but not all) settings, and RQ2 reports human alignment. The claims are broadly supported, though 'state-of-the-art' is somewhat strong given it does not always win." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper makes causal claims through ablation ('the ensemble approach involving integrating the outputs of both similarity analysis and difference analysis'). The ablation in Table II provides controlled comparison of components, which is adequate for these causal claims about component contributions." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The paper tests on only Python and Bash across three datasets but the title and abstract frame it as a general 'Automatic Code Validation' framework. The conclusion says 'we plan to refine and expand the capabilities of CodeSift by exploring its performance across a broader range of programming languages,' implicitly acknowledging limited coverage, but the title and abstract do not bound the generalization." 129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper does not discuss alternative explanations for why CodeSift outperforms baselines. For example, it does not consider whether the improvements could be due to prompt engineering advantages rather than the text-to-text comparison paradigm, or whether different prompt phrasings for baselines could close the gap." 
134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper uses 'Mistral 7B', 'Mixtral 8x7B', 'Llama2-Chat 70B', 'Starcoder', 'Codellama 34B', and 'ChatGPT' without specific version identifiers or snapshot dates. 'ChatGPT' is particularly vague — no API version or snapshot date is given." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": true, 145 "justification": "Figure 1 shows the three prompts used in CodeSift: Prompt 1 (code-to-functionality), Prompt 2 (similarity analysis), and Prompt 3 (difference analysis). The actual prompt text is provided with placeholders filled by the code and task. The paper also states baselines use prompts from their respective frameworks." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "Section V-A3 states: 'All the models use the sampling decoding method with 0.6 temperature and 1.2 repetition penalty.' For code generation, temperature of 0.2 is used (except GPT3.5 at 0.8). These are specific and usable." 151 }, 152 "scaffolding_described": { 153 "applies": true, 154 "answer": true, 155 "justification": "Section IV describes the full pipeline: syntax checking (ShellCheck, PyLint), code-to-functionality extraction, similarity analysis, difference analysis, and ensemble synthesis. Figure 1 illustrates the workflow. The scaffolding is described in sufficient detail." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section V-A1 describes data preparation: 10 solutions sampled per problem for HumanEval and Bash (temperature 0.2/0.8), greedy decoding for MBPP+. Section III describes the Bash dataset creation pipeline (prologue, execution, epilogue, evaluation, cleanup). Ground truth is defined as passing all test cases." 
161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section VII is titled 'Limitations' and provides substantive discussion of the framework's weaknesses." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "The Limitations section discusses specific issues: incorrect functionality generation causing mislabeling, verbose model outputs making automatic correctness detection difficult, and inaccurate explanations from similarity/difference phases. These are specific to this study." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper does not explicitly state what the results do NOT show. The conclusion mentions plans to 'expand across a broader range of programming languages' but does not explicitly bound the current scope (e.g., 'these results apply only to Python and Bash' or 'we did not test on complex multi-file programs'). The limitations discuss failure modes but not scope boundaries." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": false, 184 "justification": "No raw data (generated code samples, model outputs, individual predictions) is made available for independent verification. Only aggregate results in tables are provided." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section V-A1 describes how code was generated: 10 solutions per problem using temperature 0.2 (or 0.8 for ChatGPT), greedy decoding for MBPP+. Section III describes how the Bash dataset was created with 100 tasks including prologues, epilogues, and test case evaluation." 
190 }, 191 "recruitment_methods_described": { 192 "applies": true, 193 "answer": false, 194 "justification": "The user study mentions '3 Subject Matter Experts (SMEs)' but does not describe how they were recruited, their qualifications, or whether the selection could introduce bias. They appear to be internal IBM employees but this is not stated or discussed." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "The pipeline from code generation through syntax checking to semantic evaluation is documented in Section IV and Section V. The flow from raw generated code to final correctness labels is clear: generate code → syntax check → code-to-functionality → similarity analysis → difference analysis → ensemble decision." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding disclosure, acknowledgments section, or grant information is provided in the paper." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "All seven authors list IBM Research or IBM as their affiliation on the first page." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "All authors are from IBM Research/IBM. IBM has a commercial interest in AI-powered code generation and validation tools. The paper does not disclose funding, but the work appears to be IBM-funded corporate research. The funder (IBM) has a stake in demonstrating effective AI code validation." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests statement or financial disclosure is provided. The paper has no declaration of conflicts of interest despite all authors being IBM employees working on a tool relevant to IBM's commercial interests." 
222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "The paper uses pre-trained models (Mistral, Mixtral, Llama2-Chat) as evaluators on HumanEval and MBPP+ benchmarks but does not state the training data cutoff dates for any of these models. HumanEval was published in 2021, and these models were likely trained on data that includes it." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "No discussion of whether the evaluator LLMs (Mistral, Mixtral, Llama2-Chat) may have seen HumanEval or MBPP problems during training. Since CodeSift uses LLMs to judge code-to-task alignment, familiarity with these benchmarks could influence evaluation behavior." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": false, 238 "justification": "HumanEval (2021) and MBPP (2021) were published well before any of the evaluator models used were released, so they plausibly appear in those models' training data. The paper does not discuss whether such contamination could affect the evaluator LLMs' ability to assess code correctness on these benchmarks." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": true, 244 "answer": false, 245 "justification": "The user study with 3 SMEs is not pre-registered. No link to a pre-registration is provided." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": true, 249 "answer": false, 250 "justification": "No IRB or ethics board approval is mentioned for the user study involving 3 SMEs." 251 }, 252 "demographics_reported": { 253 "applies": true, 254 "answer": false, 255 "justification": "The 3 SMEs are described only as 'Subject Matter Experts.' No demographics — experience level, years of experience, expertise area, or other characteristics — are reported." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": true, 259 "answer": false, 260 "justification": "No inclusion or exclusion criteria are stated for the 3 SMEs. 
It is unclear how or why these specific experts were selected." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "The user study is not an experimental study with randomized conditions — it is a validation/agreement study where SMEs evaluate CodeSift outputs. Randomization is not applicable." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "The user study involves SMEs assessing CodeSift outputs for agreement. There are no treatment conditions to blind, so blinding is not applicable to this study design." 271 }, 272 "attrition_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "The paper states '3 Subject Matter Experts' and 'Feedback was received on 105 instances' but does not report whether any SMEs started and dropped out, or how many instances were originally assigned. No attrition information is provided." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "CodeSift makes three LLM calls per code snippet (functionality, similarity, difference) plus potential syntax-fix calls, but no inference cost, latency, tokens consumed, or API costs are reported." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "No information on total computational budget, GPU hours, hardware used, or wall-clock time for experiments is provided." 288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "CodeSift outperforms state-of-the-art code evaluation methods (ICE-Score and Reference Grading) in accuracy across three datasets.", 294 "evidence": "Table I shows CodeSift with Mistral outperforms both baselines on HumanEval and Bash, and CodeSift with Mixtral outperforms on MBPP+. 
However, performance is not uniformly better — some individual code-generation-model/dataset combinations show baseline advantages.", 295 "supported": "moderate" 296 }, 297 { 298 "claim": "CodeSift's assessments align with human expert preferences, with 78% agreement on code functionality and 83% on validation output.", 299 "evidence": "RQ2 describes a user study with 3 SMEs on 105 instances from an internal code generation pipeline. The 78% and 83% agreement rates are reported.", 300 "supported": "weak" 301 }, 302 { 303 "claim": "Text-to-text comparison is more effective than text-to-code comparison for LLM-based code evaluation.", 304 "evidence": "Table I shows CodeSift (text-to-text) outperforms ICE-Score (text-to-code) by an average of 8% on Python datasets. The paper argues this demonstrates LLMs' strength in text domain comparisons.", 305 "supported": "moderate" 306 }, 307 { 308 "claim": "The ensemble of similarity and difference analysis yields the most reliable outcomes.", 309 "evidence": "Table II shows individual phases (similarity analysis, difference analysis) compared to the ensemble CodeSift in Table I. The ensemble generally achieves better accuracy than either individual phase alone.", 310 "supported": "moderate" 311 }, 312 { 313 "claim": "CodeSift can detect functional errors that unit tests fail to capture, such as violations of non-functional requirements like 'do not use recursion.'", 314 "evidence": "RQ4 and the second example in Table IV show a case where test cases pass recursive code for a task that specifies no recursion, while CodeSift correctly identifies the violation.", 315 "supported": "moderate" 316 } 317 ], 318 "methodology_tags": [ 319 "benchmark-eval" 320 ], 321 "key_findings": "CodeSift is an LLM-based framework for code validation that converts code to natural language descriptions and then compares these descriptions against task specifications using similarity and difference analysis. 
Across HumanEval, MBPP+, and a novel Bash dataset, CodeSift generally outperforms ICE-Score and Reference Grading baselines in accuracy, with Mistral 7B being the most effective evaluator model. A small user study with 3 subject matter experts (apparently IBM-internal, though the paper does not say so) on 105 instances showed 78% agreement on code functionality and 83% agreement on CodeSift's validation output. The paper demonstrates that LLMs can detect certain errors that unit tests miss, such as violations of non-functional requirements (e.g., a 'do not use recursion' constraint).", 322 "red_flags": [ 323 { 324 "flag": "Tiny user study sample", 325 "detail": "The human evaluation (RQ2) uses only 3 subject matter experts on 105 instances. This is too small to draw generalizable conclusions about human alignment. The SMEs appear to be internal IBM employees, introducing potential selection bias." 326 }, 327 { 328 "flag": "No uncertainty quantification", 329 "detail": "All results are reported as single point estimates without confidence intervals, error bars, significance tests, or variance across runs. The accuracy differences between methods are often small (1-5 percentage points) and could be within noise." 330 }, 331 { 332 "flag": "Corporate conflict of interest", 333 "detail": "All 7 authors are IBM Research/IBM employees. IBM has commercial interest in AI code generation and validation tools. No funding disclosure or competing interests statement is provided." 334 }, 335 { 336 "flag": "Benchmark contamination risk unaddressed", 337 "detail": "The evaluator LLMs (Mistral, Mixtral, Llama2-Chat) were likely trained on data containing HumanEval and MBPP problems. Since CodeSift uses these LLMs to judge code correctness, familiarity with benchmark problems could artificially inflate evaluation accuracy. This is not discussed." 338 }, 339 { 340 "flag": "No code or data release", 341 "detail": "Neither the CodeSift implementation nor the contributed Bash dataset of 100 tasks is released, making independent verification impossible." 
342 }, 343 { 344 "flag": "Overclaimed generalization", 345 "detail": "The title frames this as a general 'Automatic Code Validation' framework, but it is tested on only 2 programming languages (Python, Bash) with relatively simple self-contained programming problems. No complex multi-file or real-world codebase validation is attempted." 346 } 347 ], 348 "cited_papers": [ 349 { 350 "title": "Evaluating large language models trained on code", 351 "authors": ["Mark Chen", "Jerry Tworek"], 352 "year": 2021, 353 "arxiv_id": "2107.03374", 354 "relevance": "Introduces HumanEval benchmark and pass@k metric, foundational for LLM code generation evaluation." 355 }, 356 { 357 "title": "Ice-score: Instructing large language models to evaluate code", 358 "authors": ["Terry Yue Zhuo"], 359 "year": 2024, 360 "relevance": "Baseline method for LLM-based code evaluation using direct task-to-code comparison; directly compared against in this paper." 361 }, 362 { 363 "title": "Judging LLM-as-a-judge with MT-bench and chatbot arena", 364 "authors": ["Lianmin Zheng", "Wei-Lin Chiang"], 365 "year": 2023, 366 "relevance": "Introduces LLM-as-a-judge framework including Reference Grading, used as a baseline in this paper." 367 }, 368 { 369 "title": "G-eval: NLG evaluation using gpt-4 with better human alignment", 370 "authors": ["Yang Liu", "Dan Iter"], 371 "year": 2023, 372 "relevance": "LLM-based evaluation method for NLG that influenced prompt-based code evaluation approaches." 373 }, 374 { 375 "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation", 376 "authors": ["Jiawei Liu", "Chunqiu Steven Xia"], 377 "year": 2023, 378 "arxiv_id": "2305.01210", 379 "relevance": "Introduces EvalPlus and MBPP+ for rigorous LLM code generation evaluation, directly used as a dataset in this paper." 
380 }, 381 { 382 "title": "An empirical evaluation of using large language models for automated unit test generation", 383 "authors": ["Max Schäfer", "Sarah Nadi"], 384 "year": 2024, 385 "relevance": "Empirical study of LLMs for automated test generation, relevant to understanding validation approaches for LLM-generated code." 386 }, 387 { 388 "title": "Codamosa: Escaping coverage plateaus in test generation with pre-trained large language models", 389 "authors": ["Caroline Lemieux", "Jeevana Priya Inala"], 390 "year": 2023, 391 "relevance": "LLM-augmented test generation approach that addresses scalability of execution-based code evaluation." 392 }, 393 { 394 "title": "CodeBertScore: Evaluating code generation with pretrained models of code", 395 "authors": ["Shuyan Zhou", "Uri Alon"], 396 "year": 2023, 397 "relevance": "Embedding-based code evaluation metric using pretrained code models, representing an alternative evaluation paradigm." 398 }, 399 { 400 "title": "Starcoder: may the source be with you!", 401 "authors": ["Raymond Li", "Loubna Ben Allal"], 402 "year": 2023, 403 "arxiv_id": "2305.06161", 404 "relevance": "Open-source code generation model used as one of the code generators in evaluation." 405 }, 406 { 407 "title": "Code llama: Open foundation models for code", 408 "authors": ["Baptiste Roziere", "Jonas Gehring"], 409 "year": 2023, 410 "arxiv_id": "2308.12950", 411 "relevance": "Open-source code LLM used as one of the code generation models in evaluation." 412 }, 413 { 414 "title": "Program synthesis with large language models", 415 "authors": ["Jacob Austin", "Augustus Odena"], 416 "year": 2021, 417 "arxiv_id": "2108.07732", 418 "relevance": "Introduces MBPP benchmark for program synthesis evaluation, foundational dataset used in this paper." 419 } 420 ] 421 }