scan.json (25274B)
1 { 2 "paper": { 3 "title": "A Comparative Study of DSL Code Generation: Fine-Tuning vs. Optimized Retrieval Augmentation", 4 "authors": ["Nastaran Bassamzadeh", "Chhaya Methani"], 5 "year": 2024, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2407.02742" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No GitHub link, Zenodo archive, or any other code repository is mentioned in the paper. No code release is discussed." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The paper describes a synthetically generated dataset of 67k training samples and 1000 test samples, but no download link or public release is provided. The datasets are proprietary to Microsoft." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions using Azure AML pipelines and GPT-4 with 16k token limit but provides no requirements.txt, Dockerfile, library versions, or detailed environment setup information." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. The methodology is described at a high level but lacks sufficient detail for independent reproduction." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results in Tables 1-4 are reported as single point estimates (delta improvements) with no confidence intervals, error bars, or uncertainty quantification." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper makes comparative claims (e.g., 'TST with FD setting performs overall better') based solely on comparing delta values without any statistical significance tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports percentage-point improvements over baselines with enough context to understand magnitude (e.g., 'hallucination rate for API names dropping by 6.29 pts' in Section 6, and all table results shown as delta improvements from named baselines)." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The test set has 1000 samples and the training set has 67k samples, but no justification is given for why these sizes were chosen and no power analysis is discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance, or spread measures are reported. It is unclear whether results are from single runs or averaged over multiple runs. No mention of multiple experimental runs." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper uses a Pre-Trained model with 5 few-shots as baseline in Tables 1-3, and a fine-tuned Codex model as baseline in Table 4. Multiple ablation variants are compared." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "The fine-tuned model is based on Codex, which was deprecated by OpenAI in 2023. GPT-4 is used for RAG but no comparison against other contemporary code generation systems or approaches (e.g., Code Llama fine-tuning, StarCoder) is made. The baselines are internal ablation variants rather than competitive external systems." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper presents a systematic ablation study across Sections 5.1-5.3, varying: number of few-shots (5 vs 20), retrieval model (Pre-trained vs TST), and grounding type (no FD, FD, SFD, FD+SFD). Tables 1-4 show results for each ablation." 
74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Four metrics are used: Average Similarity (LCSS-based), %Unparsed flows, %made-up API names, and %made-up parameter keys. Defined in Section 4.2." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "Human judges were used to validate the quality of the synthetic test dataset (Section 4.1), but no human evaluation of the system's generated DSL outputs was performed. All evaluation of system outputs is automated via compiler checking and similarity metrics." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Section 4.1 describes a separate test set of 1000 samples distinct from the 67k training samples. The test set was generated independently and verified by human judges." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": false, 93 "justification": "Results are reported only as overall averages across the 1000-sample test set. No breakdown by API category, query complexity, or DSL length is provided, despite the paper mentioning roughly 700 APIs with varying frequencies (head vs. tail)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 5.3 discusses cases where SFD increases hallucination rates ('adding too many API descriptions can confuse rather than help the LLM'). Section 6 discusses where the fine-tuned model struggles with syntax vs. where RAG struggles with hallucination. Appendix A.1 shows an example with a hallucinated API call." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 4 shows that SFD increases hallucination rates (e.g., +6.76 pts for made-up parameters). Section 5.3 discusses how 'the approach to simply add semantically similar API metadata for a query is not useful for DSL generation.' 
The paper reports that RAG still lags behind fine-tuning on hallucination metrics." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims RAG achieved parity on similarity metric (supported by Table 4 showing 0 delta), RAG was 2 pts better on compilation (Table 4 shows a -5.3 delta in unparsed flows: same direction, larger magnitude), and hallucination lagged by 1 pt for API names and 2 pts for parameters (Table 4 shows +1.7 and +1.11: same direction, different magnitudes). Every claim is supported in direction, but the magnitudes stated in the abstract do not exactly match the table." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The ablation study design uses controlled single-variable manipulation (varying one factor at a time: number of few-shots, retrieval model, grounding type) which is adequate for the causal claims made about which components contribute to performance (e.g., 'few-shot examples have been successfully teaching the correct syntax')." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper title says 'DSL Code Generation' broadly but results are limited to one specific automation DSL with ~700 APIs at Microsoft. Claims about RAG vs fine-tuning are specific to this DSL but the paper draws conclusions about 'NL2DSL generation' generally without adequately bounding them to this specific domain." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for its findings. For example, it does not consider whether the fine-tuned model's advantage in hallucination could be due to the specific data distribution rather than the approach itself, or whether GPT-4 vs Codex differences confound the RAG vs fine-tuning comparison." 
126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper says 'GPT-4 (with 16k token limit)' and 'Codex base model' without specifying exact model versions, snapshot dates, or API versions. No version string like 'gpt-4-0613' is provided." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper describes 'metaprompt instructions' and 'metaprompt tuning' but does not provide the actual prompt text used. Only a high-level system architecture diagram (Figure 1) shows that prompts include grounding info, few-shot examples, and API definitions, without the actual text." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for GPT-4. For the Codex fine-tuning, LoRA-based approach is mentioned but no learning rate, epochs, LoRA rank, or other hyperparameters are specified. For BERT fine-tuning, only the loss function is described without training hyperparameters." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "The system is a single-pass RAG pipeline (retrieve, augment prompt, generate) without agentic scaffolding such as retry logic, multi-step planning, or feedback loops." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper describes sampling 700 publicly available APIs and synthetically generating NL prompts using GPT-4 (Section 4.1), but important preprocessing details are missing: how workflows were sampled from users, filtering criteria, how the 67k training samples were generated vs the 1000 test samples, and what 'manual approval' entailed for test set validation." 
153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no dedicated limitations or threats-to-validity section. The paper jumps from Results (Section 5) to Conclusion and Future Work (Section 6) without discussing limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed anywhere in the paper. The paper does not address potential issues like synthetic test set quality, model version effects, or whether results would hold for different DSLs." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper mentions 'we constrained the length of sequence to 5 APIs' (Section 1) but does not explicitly state what the results do NOT show or what settings are excluded. There is no discussion of what claims the authors are NOT making." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "Neither the raw workflow data, synthetic NL-DSL pairs, nor the test set are publicly available for independent verification." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": false, 181 "justification": "The paper states 'we had many samples of workflow automations created by users' (Section 4.1) and sampled 700 publicly available APIs, but does not describe the specific collection procedure, time period, selection criteria for user workflows, or how 'publicly available' was determined." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants were recruited for a study. Human judges validated test set quality but this is dataset construction, not a human subjects study. The data comes from a standard product usage database." 
187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "The pipeline from user workflows to synthetic NL-DSL pairs to final test/training sets is described at a high level but key steps are missing: how many workflows were initially available, how many were filtered and why, how the split between training (67k) and test (1000) was determined, and what percentage of test samples were validated by human judges." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding source or acknowledgments section is present in the paper. The authors are from Microsoft Corporation but no explicit funding disclosure is made." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed as 'Microsoft Corporation, Redmond, USA' on the first page." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "The authors are Microsoft employees evaluating approaches using Microsoft's Azure AML infrastructure and a DSL for Microsoft's automation platform (likely Power Automate). Microsoft has a financial interest in the outcome of research that validates their workflow automation technology. No independence from funder is established." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement or financial interests declaration is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper uses GPT-4 and Codex models without stating their training data cutoff dates. Since the test set contains synthetic NL-DSL pairs, it is unclear whether GPT-4 could have been exposed to similar API documentation or workflow patterns during training." 
221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No analysis of potential overlap between GPT-4's training data and the test set is discussed. The DSL is custom but the APIs are publicly documented, creating potential contamination risk." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper does not address whether GPT-4 may have seen documentation for the 700 publicly available APIs used in the benchmark. Since these are public APIs, their documentation was likely in GPT-4's training data, which could give GPT-4 an advantage in understanding function names and parameters." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants were involved in a study. Human judges validated test set quality but were not study subjects." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human subjects study was conducted. Human judges validated data quality, which does not constitute a human subjects study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human subjects study was conducted." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human subjects study was conducted." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human subjects study was conducted." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human subjects study was conducted." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human subjects study was conducted." 
268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No API costs, tokens consumed, or wall-clock time per example is reported, despite the system making multiple GPT-4 calls with varying numbers of few-shot examples (5 vs 20) and API function definitions." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget, GPU hours for fine-tuning, total API spend, or hardware specifications are reported. The paper mentions using Azure AML pipelines but provides no cost or compute details." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Fine-tuned Codex model scores best on code similarity metric for DSL generation.", 286 "evidence": "Table 4 uses fine-tuned model as baseline; RAG variants show 0 or -0.01 delta in average similarity, confirming parity but not superiority for RAG.", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "Optimized RAG achieves parity with fine-tuned model on similarity metric.", 291 "evidence": "Table 4 shows TST + FD achieves 0 delta on average similarity compared to fine-tuned baseline.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "RAG-based method has 2 pts better compilation rate than fine-tuned model.", 296 "evidence": "Table 4 shows TST + FD has -5.3 delta on %Unparsed flows compared to fine-tuned baseline (abstract says 2 pts, table shows 5.3 pts improvement, slight discrepancy).", 297 "supported": "weak" 298 }, 299 { 300 "claim": "Hallucination rate for RAG model lags behind fine-tuned model by 1 pt for API names and 2 pts for parameter keys.", 301 "evidence": "Table 4 shows TST + FD has +1.7 for made-up API names and +1.11 for made-up parameters vs fine-tuned baseline. 
Abstract says 1 pt and 2 pts respectively, which is approximately consistent but not exact.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Adding more few-shot samples (20 vs 5) improves performance across all metrics.", 306 "evidence": "Table 1 shows improvements in all four metrics when increasing from 5 to 20 few-shots for both Pre-Trained and TST models.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "Adding semantically similar API metadata (SFD) can confuse rather than help the LLM.", 311 "evidence": "Table 4 shows TST + SFD increases hallucination of parameter keys by +6.76 pts compared to fine-tuned baseline, worse than TST + FD (+1.11 pts).", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "RAG hallucination rate for API names dropped by 6.29 pts with optimizations.", 316 "evidence": "Section 6 cites Table 2, which shows TST + FD achieves -6.29 delta in made-up API names compared to Pre-Trained baseline without FD.", 317 "supported": "strong" 318 } 319 ], 320 "methodology_tags": ["benchmark-eval", "case-study"], 321 "key_findings": "The paper compares fine-tuning (Codex with LoRA) and optimized RAG (GPT-4 with TST-based few-shot retrieval and API function definitions) for DSL code generation across ~700 APIs. Optimized RAG achieves parity with fine-tuning on code similarity but has slightly higher hallucination rates for API names and parameters. RAG with API function definitions reduces parsing errors relative to fine-tuning by 5.3 pts (Table 4, TST + FD), although no significance test is reported for this difference. Adding too many semantically similar API definitions (SFD) can increase hallucination rather than reduce it.", 322 "red_flags": [ 323 { 324 "flag": "No uncertainty quantification", 325 "detail": "All results are reported as single point estimates with no confidence intervals, error bars, standard deviations, or significance tests. It is impossible to assess whether the reported differences (often 1-3 percentage points) are meaningful or within noise." 
326 }, 327 { 328 "flag": "Confounded comparison: GPT-4 vs Codex", 329 "detail": "The RAG approach uses GPT-4 while the fine-tuned model uses Codex. These are fundamentally different models with different capabilities, making it impossible to attribute performance differences to RAG vs fine-tuning rather than to the underlying model quality." 330 }, 331 { 332 "flag": "Abstract-results discrepancy", 333 "detail": "The abstract states RAG is '2 pts better' on compilation rate, but Table 4 shows a 5.3 pt improvement. The abstract says hallucination lags 'by 2 pts for API parameter keys' but Table 4 shows +1.11 pts. These inconsistencies undermine confidence in the reporting." 334 }, 335 { 336 "flag": "No reproducibility artifacts", 337 "detail": "No code, data, prompts, or hyperparameters are released. The DSL, API definitions, training data, and test set are all proprietary to Microsoft, making independent verification impossible." 338 }, 339 { 340 "flag": "Company evaluating own product", 341 "detail": "Microsoft employees evaluate approaches for Microsoft's automation platform using Microsoft's Azure infrastructure. No conflict of interest disclosure is provided." 342 }, 343 { 344 "flag": "No limitations section", 345 "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries. No consideration of alternative explanations for the observed results." 346 }, 347 { 348 "flag": "Results reported only as deltas", 349 "detail": "All tables report delta improvements from baselines rather than absolute values. The actual baseline performance numbers are never stated, making it impossible to assess the absolute quality of any approach." 
350 } 351 ], 352 "cited_papers": [ 353 { 354 "title": "Evaluating Large Language Models Trained on Code", 355 "authors": ["Mark Chen", "Jerry Tworek"], 356 "year": 2021, 357 "arxiv_id": "2107.03374", 358 "relevance": "Foundational work on code generation evaluation (Codex/HumanEval) - central to assessing LLM code generation capabilities." 359 }, 360 { 361 "title": "Synchromesh: Reliable code generation from pre-trained language models", 362 "authors": ["Gabriel Poesia", "Oleksandr Polozov"], 363 "year": 2022, 364 "arxiv_id": "2201.11227", 365 "relevance": "Introduces TST-based few-shot retrieval for improving code generation reliability, directly used as a technique in this paper." 366 }, 367 { 368 "title": "StarCoder: may the source be with you!", 369 "authors": ["Raymond Li", "Loubna Ben Allal"], 370 "year": 2023, 371 "arxiv_id": "2305.06161", 372 "relevance": "Open-source code generation model relevant to understanding the landscape of code LLMs and their evaluation." 373 }, 374 { 375 "title": "Gorilla: Large Language Model Connected with Massive APIs", 376 "authors": ["Shishir G. Patil", "Tianjun Zhang"], 377 "year": 2023, 378 "arxiv_id": "2305.15334", 379 "relevance": "Addresses LLM integration with large API sets, directly relevant to the challenge of API function name hallucination in code generation." 380 }, 381 { 382 "title": "PAL: Program-aided Language Models", 383 "authors": ["Luyu Gao", "Aman Madaan"], 384 "year": 2023, 385 "arxiv_id": "2211.10435", 386 "relevance": "Explores using code generation for reasoning tasks, connecting program synthesis with tool use - relevant to LLM programming capabilities." 387 }, 388 { 389 "title": "Improving ChatGPT Prompt for Code Generation", 390 "authors": ["Chao Liu", "Xuanlin Bao"], 391 "year": 2023, 392 "arxiv_id": "2305.08360", 393 "relevance": "Studies prompt engineering for code generation with LLMs, directly relevant to prompt-based approaches for improving code quality." 
394 }, 395 { 396 "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", 397 "authors": ["Timo Schick", "Jane Dwivedi-Yu"], 398 "year": 2023, 399 "arxiv_id": "2302.04761", 400 "relevance": "Foundational work on LLM tool use and integration, relevant to understanding how models interact with external APIs." 401 }, 402 { 403 "title": "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face", 404 "authors": ["Yongliang Shen", "Kaitao Song"], 405 "year": 2023, 406 "arxiv_id": "2303.17580", 407 "relevance": "Task orchestration using LLMs across multiple APIs, related to the multi-API planning problem addressed in this paper." 408 }, 409 { 410 "title": "TaskMatrix.AI: Completing Tasks by Connecting Foundation Models with Millions of APIs", 411 "authors": ["Yaobo Liang", "Chenfei Wu"], 412 "year": 2023, 413 "arxiv_id": "2303.16434", 414 "relevance": "Addresses scaling LLM tool use to massive API sets, directly relevant to the challenge of DSL generation over hundreds of APIs." 415 }, 416 { 417 "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions", 418 "authors": ["Nhan Nguyen", "Sarah Nadi"], 419 "year": 2022, 420 "doi": "10.1145/3524842.3528470", 421 "relevance": "Empirical evaluation of AI code generation quality, relevant to understanding how LLM-generated code is assessed." 422 } 423 ] 424 }