scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (36197B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation with Large Language Models",
      6     "authors": [
      7       "M. Weyssow",
      8       "Xin Zhou",
      9       "Kisub Kim",
     10       "David Lo",
     11       "H. Sahraoui"
     12     ],
     13     "year": 2023,
     14     "venue": "ACM Transactions on Software Engineering and Methodology",
     15     "arxiv_id": "2308.10462",
     16     "doi": "10.1145/3714461"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims PEFT superiority over ICL and RAG, memory reduction with QLoRA, and broader applicability. All are supported by Tables 3-4 and Figures 1, 5-7. The abstract's claims about 'diverse set of LLMs' and 'three representative Python code generation datasets' accurately reflect the study scope.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper's causal claims (e.g., 'LoRA significantly enhances performance', PEFT 'outperforms' alternatives) are based on controlled comparisons where only the tuning technique varies while model, data, and hardware are held constant. This controlled experimental design is adequate for the claims made.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title says 'Code Generation' broadly while testing only Python. The conclusion claims PEFT 'opens opportunities for broader applications of PEFT in software engineering scenarios' without evidence for non-Python languages or non-generation tasks. The threats-to-validity section acknowledges the monolingual limitation but then speculates that 'PEFT is also applicable to other programming languages.'",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The Discussion section hypothesizes QLoRA-4bit's improvement 'stems from the regularization effect of reducing weight precision to 4 bits' (Section 6). The threats to validity section discusses hyperparameter sensitivity, model selection bias, and dataset representativeness as factors that could influence results.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper consistently frames its measurements (EM@k, CodeBLEU, Pass@k) as what they are — match-based and execution-based code generation metrics — without inflating them to broader claims about developer productivity or code quality. No proxy gap exists between measurements and claims.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 'Threats to Validity' provides a dedicated, substantive discussion organized by external, internal, and construct validity.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are discussed: model selection limited to open-source (Section 7, external), monolingual Python-only evaluation (external), hyperparameter values taken from prior work without exhaustive tuning (internal), CodeBLEU's reliance on dataflow graphs, which are not always available for small examples (construct), and the lack of unit tests in Conala/CodeAlpacaPy (construct).",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states: closed-source models excluded due to parameter inaccessibility, full fine-tuning infeasible for LLMs within 24GB GPU constraint, HumanEval/MBPP excluded for lacking training data, Python only (acknowledging monolingual limitation), and no larger models beyond QLoRA experiments.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding information, acknowledgments section, or grant numbers appear in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: DIRO, University of Montreal (Weyssow, Sahraoui) and Singapore Management University (Zhou, Kim, Lo). These are academic institutions with no apparent conflict with the evaluated open-source models.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Without funding disclosure, independence of the funder cannot be assessed. The authors are at academic institutions and evaluate open-source models they did not develop, suggesting likely independence, but this is not explicitly stated.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial interest declaration appears in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "LLMs (≥1B parameters), SLMs (<1B parameters), PEFT, ICL, RAG, LoRA, QLoRA, IA3, Prompt tuning, and Prefix tuning are all explicitly defined in Sections 2-3.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are listed at the end of Section 1: a comprehensive PEFT empirical study for LLMs, a comparative analysis against ICL/RAG, and a demonstration of resource-constrained fine-tuning practicality.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 8 explicitly contrasts the work against prior PEFT studies on SLMs (CodeBERT, CodeT5) and positions the paper as the first comprehensive study of PEFT on LLMs (≥1B) for code generation in SE.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper states 'We make our code publicly available: https://github.com/martin-wey/peft-llm-code' in Section 4.6.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All three datasets are public: Conala (curated version from Zhou et al. 2023), CodeAlpaca (Chaudhary 2023), and APPS (Hendrycks et al. 2021). The authors constructed CodeAlpacaPy by filtering Python samples from CodeAlpaca and release their code which includes this filtering.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions using HuggingFace and PEFT libraries and an NVIDIA RTX A5000 24GB GPU, but does not provide library versions, a requirements.txt, Dockerfile, or detailed environment specification sufficient to recreate the software environment.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "While the paper provides detailed methodology in Section 4.6 and releases code on GitHub, the paper itself does not include step-by-step reproduction instructions or commands to replicate results.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 3, 4 and Figures 3-7 are reported as point estimates only. No confidence intervals, error bars, or ± notation appear anywhere in the paper.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper uses language like 'significantly outperform' and 'consistently outperform' throughout but performs no statistical significance tests. All comparative claims are based solely on numerical differences between point estimates.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The paper consistently reports relative improvements with baseline context, e.g., 'surpasses the best small model by 39.8%, 41.7%, and 47.1%' (Section 5.2), '23.1% improvement in EM@10 on Conala (36.28 for LoRA vs. 29.47 for ICL)' (Section 5.3), and 'QLoRA-4bit results in a notable 52% increase' (Section 5.4).",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification is provided for why the dataset sizes (Conala: 543 test, CodeAlpacaPy: 628 test, APPS: 750 test) are sufficient for the claims being made. No power analysis is discussed.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or spread measures are reported for any experiment. Results appear to be from single runs with no indication of variability across seeds or runs.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Comprehensive baselines are included: zero-shot, ICL with varying numbers of examples, RAG with GTE-small retrieval (Section 4.3), full fine-tuning for SLMs, and multiple PEFT techniques compared against each other.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include CodeLlama (2023), CodeGen2 (2023), and CodeT5+ (2023), which were recent models at the time of writing. RAG uses GTE-small, a competitive embedding model.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The study systematically compares LoRA, IA3, Prompt tuning, Prefix tuning, QLoRA-8bit, and QLoRA-4bit across the same models and datasets, effectively serving as an ablation across techniques. Effect of quantization precision (8-bit vs 4-bit) and number of ICL examples are also systematically varied.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Conala and CodeAlpacaPy use EM@1, EM@10, and CodeBLEU. APPS uses average test cases passed and Pass@k (k=1,2,5). Multiple metrics reported throughout.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "All evaluation is entirely automated (exact match, CodeBLEU, test case pass rates). No human evaluation of generated code quality is performed.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "All three datasets have separate train/validation/test splits. Section 4.6 states 'We selected the checkpoint with the lowest evaluation loss for inference,' confirming model selection on validation, reporting on test.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by model, by PEFT technique, by dataset, and for APPS by difficulty level (introductory, interview, competition) in Table 4. Table 3 provides per-model per-technique breakdowns.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The paper discusses where approaches fail: Prefix tuning 'fails to effectively adapt the larger models' (Section 5.2, some configurations show 0.0 EM@10 in Table 3), RAG underperforms ICL on CodeAlpacaPy (Section 5.3), and improvements are 'less substantial for interview and competition-level code generation' (Section 5.4).",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Several negative results are reported: Prefix tuning fails for larger models (0.0 EM scores in Table 3), increasing ICL examples beyond a threshold degrades performance (Section 5.1), RAG yields lower EM@10 than ICL on CodeAlpacaPy (Section 5.3), and QLoRA-8bit sometimes underperforms LoRA (Table 4).",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model names and sizes are given: CodeLlama-7B, CodeLlama-7B-Instruct, CodeLlama-7B-Python, CodeLlama-13B-Python, CodeLlama-34B-Python, CodeGen2-1B/3.7B/7B, CodeGen-350M-mono, CodeT5+-220M/770M. For open-source models with fixed checkpoints, these names uniquely identify the model weights.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Table 2 shows the full prompt template with actual examples: '### Instruction: [intent] ### Response:'. The paper also describes the ICL prompt construction where examples are concatenated with the same template format.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Section 4.6 provides detailed hyperparameters: learning rates (5e-5 for full FT, 3e-4 for LoRA/IA3/QLoRA, 3e-3/3e-2 for Prompt/Prefix tuning), r=16, alpha=32, 20 virtual tokens, Adafactor optimizer, 16-bit float, 5 epochs, batch size 8, beam search with beam size 10, max token lengths (64/128/1024).",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. This is standard fine-tuning and inference with direct model prompting.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 4.2 describes CodeAlpacaPy construction (filtering Python samples from CodeAlpaca, removing syntactically invalid code), Conala curation (ensured no function overlap between splits, no same-post overlap), and APPS split (4500/500/750 with 250 test samples per difficulty). Train/val/test sizes are stated for all datasets.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "While the input datasets are public and code is released, raw experimental outputs (model predictions, per-example scores) are not made available for independent verification.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 4.2 describes how each dataset was collected: Conala crawled from StackOverflow with manual annotations, CodeAlpacaPy filtered from CodeAlpaca for Python with syntactic validation, APPS from coding competitions with difficulty categorization.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. All data comes from standard public benchmarks (Conala from StackOverflow, CodeAlpaca from LLM-generated data, APPS from coding competitions).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The data pipeline is documented: CodeAlpacaPy was constructed by filtering Python samples from CodeAlpaca and removing syntactically invalid code. Dataset splits are specified with exact counts (e.g., Conala: 2,135/201/543). The Conala curation process ensuring no overlap between splits is described.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for any of the models used (CodeLlama, CodeGen2, CodeGen, CodeT5+). The paper mentions CodeLlama is based on Llama 2 and CodeGen2 was pre-trained on TheStack, but does not state when training data was collected.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper discusses train/test split integrity for fine-tuning (Conala curated to avoid function overlap), but does not discuss whether pretrained models may have seen test examples during pretraining. Conala (StackOverflow) and APPS (competition problems) are publicly available and could appear in pretraining corpora.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "All three benchmarks (Conala from 2018, CodeAlpaca from 2023, APPS from 2021) were available online before the models' training periods. The paper does not discuss whether models may have seen benchmark solutions during pretraining.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study. All experiments are automated benchmark evaluations of code generation models.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants. The study evaluates models on existing public benchmarks.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants involved.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants involved.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants involved.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants involved.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants involved.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "The paper focuses extensively on training memory consumption (Figures 1 and 5) but does not report inference cost, latency, or time per example for any configuration.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "The paper states hardware used (single NVIDIA RTX A5000 24GB GPU) and peak memory consumption, but does not report total GPU hours, wall-clock training time, or total computational budget across all experiments.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No seed sensitivity analysis is reported. For ICL, the paper mentions 'selecting random few-shot examples using different seeds' (Section 8) but does not report results across seeds. Fine-tuning results appear to be single-seed.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs is never stated. Results appear to be from single runs with no indication of how many trials produced them.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search budget is reported. Section 7 (internal validity) states 'we used hyperparameters values which have been used in previous work' but does not report whether any search was performed or how many configurations were tried.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": true,
    392           "justification": "Section 4.6 clearly states: 'We selected the checkpoint with the lowest evaluation loss for inference,' which is a principled selection on the validation set, not the test set.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": false,
    397           "answer": false,
    398           "justification": "No statistical significance tests are performed at all, so multiple comparison correction is not applicable.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors implement all techniques (LoRA, IA3, Prompt tuning, Prefix tuning, ICL, RAG) themselves using the PEFT library. They do not acknowledge potential bias in their own implementations of these methods or discuss whether their ICL/RAG implementations are competitive.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": true,
    410           "justification": "Table 1 explicitly categorizes techniques by computation cost vs effectiveness. Figure 5 shows EM@10 alongside GPU peak memory for different quantization levels. Figure 1 compares memory consumption across fine-tuning approaches. The computation-effectiveness tradeoff is a central theme.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "The paper does not discuss whether Conala, CodeAlpacaPy, or APPS actually measure meaningful code generation capability. EM@k requires exact string match to reference solutions, a very narrow proxy for code quality, but this limitation is not discussed. The construct validity threats in Section 7 address only metric choice, not benchmark validity.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding is involved. All models are evaluated with the same direct inference setup (prompt in, code out), so no scaffold confound exists.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "Not discussed. Conala (2018) and APPS (2021) predate the models used (CodeLlama and CodeGen2, both 2023). Models could have seen benchmark solutions during pretraining, but temporal leakage is not mentioned.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "Not discussed. The Conala dataset includes variable hints in natural language intents (e.g., 'map two lists keys and values into a dictionary'), which provide ground-truth variable names, but this is not analyzed as potential feature leakage.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "The paper notes Conala's curated version ensures no function overlap between fine-tuning splits and no same-post overlap, but does not address whether pretrained models' training corpora overlap with benchmark data. StackOverflow data (Conala) is commonly included in code pretraining corpora.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipelines are used.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "LLMs fine-tuned with PEFT consistently outperform SLMs with full fine-tuning by 39.8–72.3% in EM@k under the same 24GB GPU constraint.",
    457       "evidence": "Table 3 shows CodeLlama-7B-Python with LoRA achieving EM@10=36.28 on Conala vs CodeGen-350M-mono full FT at 18.42; percentage differences stated in RQ2 answer.",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "LoRA outperforms ICL and RAG for all CodeLlama-7B variants on both Conala and CodeAlpacaPy.",
    462       "evidence": "Figures 6 and 7 show LoRA EM@10=39.31 vs ICL=29.83 and RAG=35.17 for CodeLlama-7B on Conala; consistent pattern across all three CodeLlama variants.",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "QLoRA-4bit achieves approximately 2x reduction in peak GPU memory compared to LoRA while maintaining or improving effectiveness.",
    467       "evidence": "Figures 1 and 5 show CodeLlama-7B-Python peak GPU memory at ~19GB for LoRA vs ~9.16GB for QLoRA-4bit; EM@10 improves from 36.28 to 37.8 on Conala.",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "LoRA outperforms full fine-tuning for SLMs, with CodeGen-350M-mono LoRA EM@10 at 25.60 vs full FT at 18.42 on Conala.",
    472       "evidence": "Table 3 shows LoRA consistently outperforming full FT for all three SLMs on both datasets; authors note this contrasts NLP findings.",
    473       "supported": "strong"
    474     },
    475     {
    476       "claim": "Increasing ICL examples beyond an optimal point (8 for Conala, 4 for CodeAlpacaPy) degrades model effectiveness.",
    477       "evidence": "Figure 3 shows EM@10 peaking then declining as examples increase for most models; explicitly stated in RQ1 answer.",
    478       "supported": "moderate"
    479     },
    480     {
    481       "claim": "QLoRA-4bit enables fine-tuning of CodeLlama-34B-Python within 24GB GPU, achieving 12.2% EM@10 improvement over CodeLlama-7B-Python with LoRA on Conala.",
    482       "evidence": "Figure 5 shows CodeLlama-34B-Python QLoRA-4bit at EM@10=40.70 vs CodeLlama-7B-Python LoRA at 36.28, with peak memory at 23.59GB.",
    483       "supported": "strong"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval"
    488   ],
    489   "key_findings": "PEFT techniques, particularly LoRA, consistently outperform both ICL and RAG for Python code generation across multiple LLM families, with LLMs under PEFT surpassing fully fine-tuned SLMs by 39.8–72.3% in EM@k under the same GPU budget. QLoRA-4bit achieves approximately 2x memory reduction while matching or exceeding LoRA effectiveness, enabling fine-tuning of 34B-parameter models on a single 24GB GPU. Notably, PEFT outperforms full fine-tuning even for SLMs, contrasting with prior NLP findings. Prefix tuning fails almost entirely for LLMs, while RAG underperforms random ICL on complex code generation tasks.",
    490   "red_flags": [
    491     {
    492       "flag": "No statistical significance tests",
    493       "detail": "All comparative claims (e.g., 'LoRA significantly enhances') are made without any statistical tests (t-test, Wilcoxon, etc.). Differences are treated as meaningful based on magnitude alone."
    494     },
    495     {
    496       "flag": "Single-run results, no variance",
    497       "detail": "All EM@k and CodeBLEU results in Table 3 are single runs with no standard deviation across seeds or runs reported, making it impossible to assess result stability."
    498     },
    499     {
    500       "flag": "Contamination unaddressed",
    501       "detail": "Pre-trained models (CodeLlama, CodeGen2) were trained on large code corpora before the study; whether Conala or APPS test examples appeared in training data is never discussed."
    502     },
    503     {
    504       "flag": "No full fine-tuning LLM baseline",
    505       "detail": "The paper cannot compare PEFT to full fine-tuning for LLMs (its primary comparison class) due to resource constraints, leaving the central efficiency claim unverified against the strongest baseline."
    506     },
    507     {
    508       "flag": "Narrow scope claimed as comprehensive",
    509       "detail": "The study is limited to Python code generation on three datasets with specific open-source models, but is described as 'comprehensive' throughout the abstract and conclusions."
    510     }
    511   ],
    512   "cited_papers": [
    513     {
    514       "title": "Evaluating Large Language Models Trained on Code (Codex / HumanEval)",
    515       "relevance": "Foundational LLM code generation paper; establishes HumanEval benchmark and zero-shot code generation capability baseline"
    516     },
    517     {
    518       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    519       "relevance": "Core PEFT technique evaluated in this paper; establishes the low-rank matrix injection approach"
    520     },
    521     {
    522       "title": "QLoRA: Efficient Finetuning of Quantized LLMs",
    523       "relevance": "Second main PEFT technique evaluated; combines LoRA with 4/8-bit quantization for memory reduction"
    524     },
    525     {
    526       "title": "Code Llama: Open Foundation Models for Code",
    527       "relevance": "Best-performing model family in the study; establishes baseline for code-specialized LLMs"
    528     },
    529     {
    530       "title": "Measuring Coding Challenge Competence With APPS",
    531       "relevance": "One of three evaluation datasets used; provides execution-based functional correctness evaluation"
    532     },
    533     {
    534       "title": "Delta Tuning: A Comprehensive Study of Parameter Efficient Methods for Pre-trained Language Models",
    535       "relevance": "Prior comprehensive PEFT study for NLP models; this paper contrasts its results with that study, which found full fine-tuning superior, whereas this paper finds the opposite for SLMs in SE"
    536     },
    537     {
    538       "title": "Few-Shot Parameter-Efficient Fine-Tuning is Better and Cheaper than In-Context Learning (IA3)",
    539       "relevance": "Introduces IA3 PEFT technique evaluated in this paper; also makes claims about PEFT vs ICL tradeoffs"
    540     },
    541     {
    542       "title": "CodeT5+: Open Code Large Language Models for Code Understanding and Generation",
    543       "relevance": "SLM family evaluated in study; establishes encoder-decoder architecture baseline for code generation"
    544     },
    545     {
    546       "title": "Learning to Mine Aligned Code and Natural Language Pairs from Stack Overflow (Conala)",
    547       "relevance": "Source of the Conala dataset, one of three evaluation benchmarks in this study"
    548     },
    549     {
    550       "title": "Docprompting: Generating Code by Retrieving the Docs",
    551       "relevance": "Prior RAG-for-code-generation study using Conala; provides RAG baseline comparison context"
    552     }
    553   ],
    554   "engagement_factors": {
    555     "practical_relevance": {
    556       "score": 3,
    557       "justification": "Directly addresses the real constraint of limited GPU memory for practitioners wanting to fine-tune LLMs, with concrete recipes for running 34B models on a single 24GB GPU."
    558     },
    559     "surprise_contrarian": {
    560       "score": 2,
    561       "justification": "The finding that PEFT outperforms full fine-tuning for SLMs contradicts prior NLP findings (Ding et al.), and the finding that RAG underperforms random ICL on complex tasks challenges the RAG hype."
    562     },
    563     "fear_safety": {
    564       "score": 0,
    565       "justification": "No AI risk, safety, or misuse concerns addressed; purely a techniques comparison study."
    566     },
    567     "drama_conflict": {
    568       "score": 0,
    569       "justification": "No controversy; straightforward empirical comparison with no competing claims or disputes."
    570     },
    571     "demo_ability": {
    572       "score": 2,
    573       "justification": "Code is publicly available on GitHub with open-source models and public datasets; a practitioner could reproduce experiments or adapt the pipeline."
    574     },
    575     "brand_recognition": {
    576       "score": 0,
    577       "justification": "Academic labs (University of Montreal, Singapore Management University) with no major industry affiliation; no famous product evaluated."
    578     }
    579   },
    580   "hn_data": {
    581     "threads": [
    582       {
    583         "hn_id": "32632312",
    584         "title": "Exploring the Role of the Cybercrime Underground in the Russia-Ukraine Conflict",
    585         "points": 4,
    586         "comments": 0,
    587         "url": "https://news.ycombinator.com/item?id=32632312",
    588         "created_at": "2022-08-28T21:36:55Z"
    589       },
    590       {
    591         "hn_id": "35662520",
    592         "title": "Learning to Program with Natural Language",
    593         "points": 3,
    594         "comments": 2,
    595         "url": "https://news.ycombinator.com/item?id=35662520",
    596         "created_at": "2023-04-22T01:45:40Z"
    597       },
    598       {
    599         "hn_id": "37866902",
    600         "title": "Getting Bored of Cyberwar",
    601         "points": 3,
    602         "comments": 1,
    603         "url": "https://news.ycombinator.com/item?id=37866902",
    604         "created_at": "2023-10-13T05:03:06Z"
    605       },
    606       {
    607         "hn_id": "37232173",
    608         "title": "GPT-NER: Named Entity Recognition via Large Language Models",
    609         "points": 3,
    610         "comments": 0,
    611         "url": "https://news.ycombinator.com/item?id=37232173",
    612         "created_at": "2023-08-23T05:23:52Z"
    613       },
    614       {
    615         "hn_id": "37168933",
    616         "title": "Fast as Chita: Neural Network Pruning with Combinatorial Optimization",
    617         "points": 2,
    618         "comments": 0,
    619         "url": "https://news.ycombinator.com/item?id=37168933",
    620         "created_at": "2023-08-17T22:16:16Z"
    621       },
    622       {
    623         "hn_id": "35984221",
    624         "title": "SLiC-HF: Sequence Likelihood Calibration with Human Feedback",
    625         "points": 2,
    626         "comments": 0,
    627         "url": "https://news.ycombinator.com/item?id=35984221",
    628         "created_at": "2023-05-18T04:48:32Z"
    629       },
    630       {
    631         "hn_id": "35263649",
    632         "title": "A comprehensive capacity analysis of GPT-3 and GPT-3.5 models",
    633         "points": 2,
    634         "comments": 0,
    635         "url": "https://news.ycombinator.com/item?id=35263649",
    636         "created_at": "2023-03-22T16:39:00Z"
    637       },
    638       {
    639         "hn_id": "37232871",
    640         "title": "Vanilla Transformer SOTA for Traffic Forecasting [pdf]",
    641         "points": 1,
    642         "comments": 0,
    643         "url": "https://news.ycombinator.com/item?id=37232871",
    644         "created_at": "2023-08-23T07:33:46Z"
    645       },
    646       {
    647         "hn_id": "37958375",
    648         "title": "Revealing the structure of language model capabilities",
    649         "points": 1,
    650         "comments": 0,
    651         "url": "https://news.ycombinator.com/item?id=37958375",
    652         "created_at": "2023-10-20T16:40:14Z"
    653       },
    654       {
    655         "hn_id": "35670419",
    656         "title": "Fully Autonomous Programming with Large Language Models",
    657         "points": 1,
    658         "comments": 0,
    659         "url": "https://news.ycombinator.com/item?id=35670419",
    660         "created_at": "2023-04-22T20:05:33Z"
    661       }
    662     ],
    663     "top_points": 4,
    664     "total_points": 22,
    665     "total_comments": 3
    666   }
    667 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs