scan-v5.json (27550B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation with Large Language Models", 6 "authors": [ 7 "M. Weyssow", 8 "Xin Zhou", 9 "Kisub Kim", 10 "David Lo", 11 "H. Sahraoui" 12 ], 13 "year": 2023, 14 "venue": "ACM Transactions on Software Engineering and Methodology", 15 "arxiv_id": "2308.10462", 16 "doi": "10.1145/3714461" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "Abstract claims PEFT superiority over ICL/RAG and QLoRA memory reduction are directly supported by Tables 3-4 and Figures 5-7 showing EM@k and GPU memory results across all model families.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Claims like 'LoRA improves effectiveness' are supported by controlled comparisons holding models constant and varying technique across identical datasets and splits; the design is adequate for comparative causal claims.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "The paper explicitly bounds claims to Python code generation, single-GPU resource constraint, and the specific model families tested; Threats to Validity (Section 7) explicitly flags the monolingual limitation and restricted model selection.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The main finding that PEFT beats ICL/RAG is not accompanied by discussion of alternative explanations (e.g., whether optimized ICL example selection would close the gap); only the QLoRA-4bit improvement mentions a hypothesis (regularization effect).", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper uses EM@k, CodeBLEU, 
and Pass@k as proxies for code generation quality and explicitly notes in Section 5.3 the distinction between EM (requiring exact match) and CodeBLEU (rewarding near-correct solutions), clarifying what each metric captures.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 7 'Threats to Validity' contains dedicated subsections for external, internal, and construct validity with multiple specific threats discussed.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Specific threats include: Python-only datasets limiting multilingual generalizability, hyperparameter choices based on prior work without sensitivity analysis, and EM@k not capturing execution correctness for Conala/CodeAlpacaPy.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper explicitly states it excludes closed-source models, excludes full fine-tuning for LLMs due to resource constraints, and notes that combining ICL/RAG with fine-tuned LLMs was not explored.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding acknowledgment or disclosure appears anywhere in the provided paper text.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All five authors' institutional affiliations (University of Montreal, Singapore Management University) are disclosed on the title page with email addresses.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "Funding is not disclosed, making independence assessment impossible.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 
"applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial interests statement appears in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "PEFT, ICL, RAG, LLM (≥1B parameters), and SLM (<1B parameters) are all explicitly defined in Sections 1-2 with precise parameter-count boundaries and technical descriptions.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 1 explicitly lists three contributions: comprehensive empirical study of 6 PEFT techniques for LLMs in code generation, comparison against ICL/RAG, and demonstration of practicality under limited resources.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 8 explicitly distinguishes this work from prior PEFT studies by noting they focused on SLMs (<0.25B parameters) and explicitly claims this is 'among the first comprehensive exploration of PEFT techniques for LLMs in software engineering.'", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "Code is publicly available at https://github.com/martin-wey/peft-llm-code, mentioned in Section 4.6.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "Conala, APPS, and CodeAlpaca are all publicly available datasets; CodeAlpacaPy is a filtered subset of CodeAlpaca described in sufficient detail to reproduce.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "Only the GPU model (NVIDIA RTX A5000 24GB) and library names (HuggingFace, PEFT) are mentioned; no requirements.txt, 
Dockerfile, or versioned dependency list is provided.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step reproduction instructions appear in the paper; the hyperparameters are listed but no runnable workflow or README-equivalent is described.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "All results in Tables 3-4 and Figures 3-7 are reported as single point estimates with no confidence intervals or error bars.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are applied to any comparative claims despite the paper making numerous 'X outperforms Y' conclusions across all four RQs.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Percentage improvements are reported with baselines (e.g., 'best LLM surpasses best small model by 39.8–72.3% in EM@k', 'QLoRA-4bit boosting average passed tests by 52%') providing effect size context.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "Dataset sizes are described but no power analysis or justification for why 543/628/750 test examples are sufficient to detect the observed effect sizes is provided.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "All reported EM@k and CodeBLEU scores are single values with no standard deviation, variance, or multi-run averaging reported.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Zero-shot, ICL (random), RAG, and full fine-tuning for 
SLMs are all used as baselines against PEFT techniques.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "CodeLlama (2023), CodeGen2 (2023), and CodeT5+ (2023) are all recent model families; RAG uses GTE-small described as outperforming OpenAI embeddings.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "The systematic comparison across LoRA, IA3, Prompt tuning, Prefix tuning, QLoRA-8bit, and QLoRA-4bit effectively ablates the contribution of each PEFT design choice.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "EM@1, EM@10, CodeBLEU are used for Conala/CodeAlpacaPy; average test cases passed and Pass@k (k=1,2,5) are used for APPS.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "Automated code generation benchmarks with ground truth make human evaluation not clearly required for the claims made; the paper focuses on match-based and execution-based correctness.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "All three datasets have explicit train/validation/test splits; Conala 2135/201/543, CodeAlpacaPy 2192/314/628, APPS 4500/500/750.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Table 4 breaks APPS results into introductory, interview, and competition difficulty levels; model family breakdowns across SLMs and LLMs are provided throughout.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": false, 224 "justification": "The paper notes that improvements are 'less substantial for interview and competition-level tasks' and that Prefix tuning 'fails to effectively adapt larger models,' 
but no specific failure case examples are shown.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "Negative results are clearly reported: Prefix tuning fails for larger LLMs, RAG underperforms ICL on complex CodeAlpacaPy, and PEFT gains are minimal for competition-level APPS problems.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Specific model variants are named: CodeGen-350M-mono, CodeT5+-220M/770M, CodeGen2-1B/3.7B/7B, CodeLlama-7B/7B-Instruct/7B-Python/13B-Python/34B-Python with exact parameter counts.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "Table 2 shows the actual prompt template with '### Instruction:' and '### Response:' delimiters plus three concrete examples from each dataset.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Section 4.6 reports learning rates (5e-5 for full FT, 3e-4 for LoRA/IA3/QLoRA, 3e-3 for Prompt tuning, 3e-2 for Prefix tuning), LoRA rank r=16, alpha=32, 20 virtual tokens, batch size 8, 5 epochs, beam size 10.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "This is a fine-tuning/inference study with no agentic scaffolding; the question is not applicable.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "CodeAlpacaPy construction is described (filtering for Python, static parsing for syntactic validity); Conala curation is described (ensuring StackOverflow post separation across splits, function uniqueness).", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 
"answer": true, 270 "justification": "All three datasets (Conala, APPS, CodeAlpaca) are publicly available; the filtered CodeAlpacaPy subset is derivable from the public CodeAlpaca dataset using the described procedure.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Section 4.2 describes each dataset's origin: Conala crawled from StackOverflow with manual annotation, APPS from competitive programming, CodeAlpacaPy filtered from CodeAlpaca for syntactically valid Python.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; all data from standard benchmarks and code repositories.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The full pipeline from dataset selection through train/val/test splitting, preprocessing, fine-tuning, and evaluation is described in Sections 4.2-4.6.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No training data cutoff dates are stated for CodeLlama, CodeGen2, or CodeT5+ despite these models having known pre-training corpora that may overlap with benchmark datasets.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "Intra-dataset train/test overlap is addressed for Conala, but whether model pre-training data (TheStack, code data) contains the APPS, Conala, or CodeAlpaca test examples is not discussed.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "Conala (2018), APPS (2021), and CodeAlpaca (2023) were available before CodeLlama and CodeGen2 training cutoffs; this potential contamination is not 
addressed.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants in this study.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants in this study.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants in this study.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants in this study.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants in this study.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants in this study.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants in this study.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "Peak GPU memory consumption during inference and fine-tuning is reported in Figure 1 for all model configurations, which is the primary resource constraint discussed.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": true, 366 "justification": "The entire study is explicitly conducted under a single NVIDIA RTX A5000 24GB GPU constraint, stated as the computational budget in Section 4.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "PEFT techniques (LoRA, IA3) consistently outperform ICL for LLMs on code generation", 375 "evidence": "Figure 6 shows all models 
fine-tuned with LoRA achieve significantly higher EM@10 than their ICL counterparts on both Conala and CodeAlpacaPy; CodeLlama-7B-Python LoRA achieves 36.28 vs 29.47 ICL EM@10 on Conala (23.1% improvement)", 376 "supported": "strong" 377 }, 378 { 379 "claim": "The best PEFT-tuned LLM outperforms the best SLM (itself LoRA-tuned, not fully fine-tuned) by 39.8–72.3% in EM@k", 380 "evidence": "Table 3 shows best LLM (CodeLlama-7B-Python with LoRA) vs best SLM (CodeGen-350M-mono with LoRA): 39.8–72.3% improvement in EM@k on Conala and CodeAlpacaPy under same 24GB GPU constraint", 381 "supported": "strong" 382 }, 383 { 384 "claim": "QLoRA-4bit reduces peak GPU memory up to 2x versus LoRA while maintaining effectiveness", 385 "evidence": "Figure 1 shows CodeLlama-7B-Python: LoRA uses 19.06GB, QLoRA-4bit uses 9.16GB (2x reduction); Figure 5 shows QLoRA-4bit achieves 40.70 EM@10 vs LoRA's 36.28 on Conala for CodeLlama-34B", 386 "supported": "strong" 387 }, 388 { 389 "claim": "LoRA outperforms RAG for code generation on both datasets across all CodeLlama variants", 390 "evidence": "Figure 7 shows CodeLlama-7B achieves 39.31 EM@10 with LoRA vs 35.17 with RAG (best) vs 29.83 with ICL on Conala; similar pattern holds for CodeAlpacaPy", 391 "supported": "strong" 392 }, 393 { 394 "claim": "PEFT outperforms full fine-tuning for SLMs, contrasting with NLP findings", 395 "evidence": "Table 3 shows CodeGen-350M-mono LoRA achieves 25.60 EM@10 on Conala vs 18.42 for full fine-tuning; similar patterns for CodeT5+ variants. 
Authors note this contrasts with Ding et al.'s NLP finding that full fine-tuning is superior", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Prefix tuning fails to effectively adapt larger LLMs to code generation datasets", 400 "evidence": "Table 3 shows Prefix tuning yields 0.0 EM@1 and 0.16–0.32 EM@10 on CodeAlpacaPy for CodeGen2-7B, CodeLlama variants, and all models ≥3.7B, while LoRA achieves 7–8% EM@1 on the same models", 401 "supported": "strong" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval", 406 "observational" 407 ], 408 "key_findings": "PEFT techniques, particularly LoRA, consistently outperform both ICL and RAG for Python code generation across 11 LLMs and SLMs tested under a single 24GB GPU constraint. The best PEFT-tuned LLM surpasses the best SLM (itself LoRA-tuned) by 39.8–72.3% in EM@k, and PEFT also beats full fine-tuning for SLMs (contrasting with NLP literature). QLoRA-4bit enables fine-tuning of 34B parameter models within a 24GB GPU while achieving comparable or superior performance to LoRA, and Prefix tuning consistently fails for models of 3.7B parameters and above. Benchmark contamination from model pre-training data is unaddressed, and no statistical significance tests are applied to any comparative claims.", 409 "red_flags": [ 410 { 411 "flag": "No statistical significance tests", 412 "detail": "All comparative claims ('LoRA significantly enhances', 'consistently outperforms') are made without any statistical tests; single-run point estimates are reported throughout Tables 3-4." 413 }, 414 { 415 "flag": "No variance across runs", 416 "detail": "No standard deviation or multi-run results are reported; fine-tuning with random initialization and dataset sampling introduces variance that is unmeasured." 
417 }, 418 { 419 "flag": "Benchmark contamination unaddressed", 420 "detail": "Conala (2018), APPS (2021), and CodeAlpaca (2023) predate the training cutoffs of CodeLlama and CodeGen2; potential test data leakage into model pre-training is never discussed." 421 }, 422 { 423 "flag": "ICL baseline potentially weak", 424 "detail": "ICL uses randomly selected examples rather than retrieval-based selection; prior work cited by the authors shows retrieval-based ICL significantly outperforms random selection, making PEFT vs ICL comparisons potentially inflated." 425 }, 426 { 427 "flag": "Python-only evaluation", 428 "detail": "All experiments use Python code generation only, yet the abstract claims PEFT 'superiority and potential over ICL and RAG across a diverse set of LLMs' without qualifying this limitation up front." 429 } 430 ], 431 "cited_papers": [ 432 { 433 "title": "LoRA: Low-Rank Adaptation of Large Language Models", 434 "relevance": "Core PEFT technique evaluated; foundational method for parameter-efficient fine-tuning" 435 }, 436 { 437 "title": "QLoRA: Efficient Finetuning of Quantized LLMs", 438 "relevance": "QLoRA technique combining LoRA with quantization; key method evaluated for memory reduction" 439 }, 440 { 441 "title": "Code Llama: Open Foundation Models for Code", 442 "relevance": "Best-performing LLM family in the study; primary model used for RQ3 and RQ4 analysis" 443 }, 444 { 445 "title": "Few-Shot Parameter-Efficient Fine-Tuning is Better and Cheaper than In-Context Learning", 446 "relevance": "Prior NLP work showing PEFT advantage; this paper extends those findings to code generation with LLMs" 447 }, 448 { 449 "title": "Delta Tuning: A Comprehensive Study of Parameter Efficient Methods for Pre-Trained Language Models", 450 "relevance": "Large-scale NLP comparison showing full FT > PEFT; this paper's SE findings contrast with these results" 451 }, 452 { 453 "title": "Measuring Coding Challenge Competence With APPS", 454 "relevance": 
"Execution-based benchmark used for RQ4; provides difficulty-stratified evaluation of code generation" 455 }, 456 { 457 "title": "CodeT5+: Open Code Large Language Models for Code Understanding and Generation", 458 "relevance": "SLM and LLM family evaluated in the study; prior work on code-specific pre-training" 459 }, 460 { 461 "title": "Docprompting: Generating Code by Retrieving the Docs", 462 "relevance": "RAG baseline approach for code generation; directly compared against PEFT in RQ3" 463 } 464 ], 465 "engagement_factors": { 466 "practical_relevance": { 467 "score": 3, 468 "justification": "Directly addresses the real constraint of single-GPU fine-tuning, with specific memory numbers and code released for practitioners to reproduce." 469 }, 470 "surprise_contrarian": { 471 "score": 2, 472 "justification": "PEFT beating full fine-tuning for SLMs contrasts with NLP literature findings, and QLoRA-4bit outperforming LoRA is counterintuitive (lower precision = better)." 473 }, 474 "fear_safety": { 475 "score": 0, 476 "justification": "No AI safety or risk concerns raised; purely a methods comparison paper." 477 }, 478 "drama_conflict": { 479 "score": 0, 480 "justification": "Straightforward empirical comparison with no controversy or competing claims." 481 }, 482 "demo_ability": { 483 "score": 2, 484 "justification": "Code is publicly available at GitHub and all models are open-source; practitioners can reproduce results on a single consumer GPU." 485 }, 486 "brand_recognition": { 487 "score": 1, 488 "justification": "University of Montreal and Singapore Management University are solid academic institutions but not top-tier AI labs; no industry co-authorship." 
489 } 490 }, 491 "hn_data": { 492 "threads": [ 493 { 494 "hn_id": "32632312", 495 "title": "Exploring the Role of the Cybercrime Underground in the Russia-Ukraine Conflict", 496 "points": 4, 497 "comments": 0, 498 "url": "https://news.ycombinator.com/item?id=32632312", 499 "created_at": "2022-08-28T21:36:55Z" 500 }, 501 { 502 "hn_id": "35662520", 503 "title": "Learning to Program with Natural Language", 504 "points": 3, 505 "comments": 2, 506 "url": "https://news.ycombinator.com/item?id=35662520", 507 "created_at": "2023-04-22T01:45:40Z" 508 }, 509 { 510 "hn_id": "37866902", 511 "title": "Getting Bored of Cyberwar", 512 "points": 3, 513 "comments": 1, 514 "url": "https://news.ycombinator.com/item?id=37866902", 515 "created_at": "2023-10-13T05:03:06Z" 516 }, 517 { 518 "hn_id": "37232173", 519 "title": "GPT-NER: Named Entity Recognition via Large Language Models", 520 "points": 3, 521 "comments": 0, 522 "url": "https://news.ycombinator.com/item?id=37232173", 523 "created_at": "2023-08-23T05:23:52Z" 524 }, 525 { 526 "hn_id": "37168933", 527 "title": "Fast as Chita: Neural Network Pruning with Combinatorial Optimization", 528 "points": 2, 529 "comments": 0, 530 "url": "https://news.ycombinator.com/item?id=37168933", 531 "created_at": "2023-08-17T22:16:16Z" 532 }, 533 { 534 "hn_id": "35984221", 535 "title": "SLiC-HF: Sequence Likelihood Calibration with Human Feedback", 536 "points": 2, 537 "comments": 0, 538 "url": "https://news.ycombinator.com/item?id=35984221", 539 "created_at": "2023-05-18T04:48:32Z" 540 }, 541 { 542 "hn_id": "35263649", 543 "title": "A comprehensive capacity analysis of GPT-3 and GPT-3.5 models", 544 "points": 2, 545 "comments": 0, 546 "url": "https://news.ycombinator.com/item?id=35263649", 547 "created_at": "2023-03-22T16:39:00Z" 548 }, 549 { 550 "hn_id": "37232871", 551 "title": "Vanilla Transformer SOTA for Traffic Forecasting [pdf]", 552 "points": 1, 553 "comments": 0, 554 "url": "https://news.ycombinator.com/item?id=37232871", 555 
"created_at": "2023-08-23T07:33:46Z" 556 }, 557 { 558 "hn_id": "37958375", 559 "title": "Revealing the structure of language model capabilities", 560 "points": 1, 561 "comments": 0, 562 "url": "https://news.ycombinator.com/item?id=37958375", 563 "created_at": "2023-10-20T16:40:14Z" 564 }, 565 { 566 "hn_id": "35670419", 567 "title": "Fully Autonomous Programming with Large Language Models", 568 "points": 1, 569 "comments": 0, 570 "url": "https://news.ycombinator.com/item?id=35670419", 571 "created_at": "2023-04-22T20:05:33Z" 572 } 573 ], 574 "top_points": 4, 575 "total_points": 22, 576 "total_comments": 3 577 } 578 }