scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27550B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation with Large Language Models",
      6     "authors": [
      7       "M. Weyssow",
      8       "Xin Zhou",
      9       "Kisub Kim",
     10       "David Lo",
     11       "H. Sahraoui"
     12     ],
     13     "year": 2023,
     14     "venue": "ACM Transactions on Software Engineering and Methodology",
     15     "arxiv_id": "2308.10462",
     16     "doi": "10.1145/3714461"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
      23         "justification": "The abstract's claims of PEFT superiority over ICL/RAG and of QLoRA memory reduction are directly supported by Tables 3-4 and Figures 5-7, which show EM@k and GPU memory results across all model families.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Claims like 'LoRA improves effectiveness' are supported by controlled comparisons holding models constant and varying technique across identical datasets and splits; the design is adequate for comparative causal claims.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper explicitly bounds claims to Python code generation, single-GPU resource constraint, and the specific model families tested; Threats to Validity (Section 7) explicitly flags the monolingual limitation and restricted model selection.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The main finding that PEFT beats ICL/RAG is not accompanied by discussion of alternative explanations (e.g., whether optimized ICL example selection would close the gap); only the QLoRA-4bit improvement mentions a hypothesis (regularization effect).",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper uses EM@k, CodeBLEU, and Pass@k as proxies for code generation quality and explicitly notes in Section 5.3 the distinction between EM (requiring exact match) and CodeBLEU (rewarding near-correct solutions), clarifying what each metric captures.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 'Threats to Validity' contains dedicated subsections for external, internal, and construct validity with multiple specific threats discussed.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats include: Python-only datasets limiting multilingual generalizability, hyperparameter choices based on prior work without sensitivity analysis, and EM@k not capturing execution correctness for Conala/CodeAlpacaPy.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states it excludes closed-source models, excludes full fine-tuning for LLMs due to resource constraints, and notes that combining ICL/RAG with fine-tuned LLMs was not explored.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment or disclosure appears anywhere in the provided paper text.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors' institutional affiliations (University of Montreal, Singapore Management University) are disclosed on the title page with email addresses.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Funding is not disclosed, making independence assessment impossible.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "PEFT, ICL, RAG, LLM (≥1B parameters), and SLM (<1B parameters) are all explicitly defined in Sections 1-2 with precise parameter-count boundaries and technical descriptions.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 1 explicitly lists three contributions: comprehensive empirical study of 6 PEFT techniques for LLMs in code generation, comparison against ICL/RAG, and demonstration of practicality under limited resources.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 8 explicitly distinguishes this work from prior PEFT studies by noting they focused on SLMs (<0.25B parameters) and explicitly claims this is 'among the first comprehensive exploration of PEFT techniques for LLMs in software engineering.'",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Code is publicly available at https://github.com/martin-wey/peft-llm-code, mentioned in Section 4.6.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Conala, APPS, and CodeAlpaca are all publicly available datasets; CodeAlpacaPy is a filtered subset of CodeAlpaca described in sufficient detail to reproduce.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Only the GPU model (NVIDIA RTX A5000 24GB) and library names (HuggingFace, PEFT) are mentioned; no requirements.txt, Dockerfile, or versioned dependency list is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions appear in the paper; the hyperparameters are listed but no runnable workflow or README-equivalent is described.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 3-4 and Figures 3-7 are reported as single point estimates with no confidence intervals or error bars.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparative claims despite the paper making numerous 'X outperforms Y' conclusions across all four RQs.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Percentage improvements are reported with baselines (e.g., 'best LLM surpasses best small model by 39.8–72.3% in EM@k', 'QLoRA-4bit boosting average passed tests by 52%') providing effect size context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Dataset sizes are described but no power analysis or justification for why 543/628/750 test examples are sufficient to detect the observed effect sizes is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "All reported EM@k and CodeBLEU scores are single values with no standard deviation, variance, or multi-run averaging reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Zero-shot, ICL (random), RAG, and full fine-tuning for SLMs are all used as baselines against PEFT techniques.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
     188           "justification": "CodeLlama (2023), CodeGen2 (2023), and CodeT5+ (2023) are all recent model families; RAG uses GTE-small, which is described as outperforming OpenAI embeddings.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The systematic comparison across LoRA, IA3, Prompt tuning, Prefix tuning, QLoRA-8bit, and QLoRA-4bit effectively ablates the contribution of each PEFT design choice.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "EM@1, EM@10, CodeBLEU are used for Conala/CodeAlpacaPy; average test cases passed and Pass@k (k=1,2,5) are used for APPS.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
     206           "justification": "Because the code generation benchmarks are automated and have ground-truth references, human evaluation is not clearly required for the claims made; the paper focuses on match-based and execution-based correctness.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "All three datasets have explicit train/validation/test splits; Conala 2135/201/543, CodeAlpacaPy 2192/314/628, APPS 4500/500/750.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 4 breaks APPS results into introductory, interview, and competition difficulty levels; model family breakdowns across SLMs and LLMs are provided throughout.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "The paper notes that improvements are 'less substantial for interview and competition-level tasks' and that Prefix tuning 'fails to effectively adapt larger models,' but no specific failure case examples are shown.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Negative results are clearly reported: Prefix tuning fails for larger LLMs, RAG underperforms ICL on complex CodeAlpacaPy, and PEFT gains are minimal for competition-level APPS problems.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model variants are named: CodeGen-350M-mono, CodeT5+-220M/770M, CodeGen2-1B/3.7B/7B, CodeLlama-7B/7B-Instruct/7B-Python/13B-Python/34B-Python with exact parameter counts.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Table 2 shows the actual prompt template with '### Instruction:' and '### Response:' delimiters plus three concrete examples from each dataset.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Section 4.6 reports learning rates (5e-5 for full FT, 3e-4 for LoRA/IA3/QLoRA, 3e-3 for Prompt tuning, 3e-2 for Prefix tuning), LoRA rank r=16, alpha=32, 20 virtual tokens, batch size 8, 5 epochs, beam size 10.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "This is a fine-tuning/inference study with no agentic scaffolding; the question is not applicable.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "CodeAlpacaPy construction is described (filtering for Python, static parsing for syntactic validity); Conala curation is described (ensuring StackOverflow post separation across splits, function uniqueness).",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "All three datasets (Conala, APPS, CodeAlpaca) are publicly available; the filtered CodeAlpacaPy subset is derivable from the public CodeAlpaca dataset using the described procedure.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 4.2 describes each dataset's origin: Conala crawled from StackOverflow with manual annotation, APPS from competitive programming, CodeAlpacaPy filtered from CodeAlpaca for syntactically valid Python.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; all data from standard benchmarks and code repositories.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline from dataset selection through train/val/test splitting, preprocessing, fine-tuning, and evaluation is described in Sections 4.2-4.6.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for CodeLlama, CodeGen2, or CodeT5+ despite these models having known pre-training corpora that may overlap with benchmark datasets.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Intra-dataset train/test overlap is addressed for Conala, but whether model pre-training data (TheStack, code data) contains the APPS, Conala, or CodeAlpaca test examples is not discussed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Conala (2018), APPS (2021), and CodeAlpaca (2023) were available before CodeLlama and CodeGen2 training cutoffs; this potential contamination is not addressed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Peak GPU memory consumption during inference and fine-tuning is reported in Figure 1 for all model configurations, which is the primary resource constraint discussed.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "The entire study is explicitly conducted under a single NVIDIA RTX A5000 24GB GPU constraint, stated as the computational budget in Section 4.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "PEFT techniques (LoRA, IA3) consistently outperform ICL for LLMs on code generation",
    375       "evidence": "Figure 6 shows all models fine-tuned with LoRA achieve significantly higher EM@10 than their ICL counterparts on both Conala and CodeAlpacaPy; CodeLlama-7B-Python LoRA achieves 36.28 vs 29.47 ICL EM@10 on Conala (23.1% improvement)",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "LLMs fine-tuned with PEFT outperform SLMs fully fine-tuned by 39.8–72.3% in EM@k",
    380       "evidence": "Table 3 shows best LLM (CodeLlama-7B-Python with LoRA) vs best SLM (CodeGen-350M-mono with LoRA): 39.8–72.3% improvement in EM@k on Conala and CodeAlpacaPy under same 24GB GPU constraint",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "QLoRA-4bit reduces peak GPU memory up to 2x versus LoRA while maintaining effectiveness",
    385       "evidence": "Figure 1 shows CodeLlama-7B-Python: LoRA uses 19.06GB, QLoRA-4bit uses 9.16GB (2x reduction); Figure 5 shows QLoRA-4bit achieves 40.70 EM@10 vs LoRA's 36.28 on Conala for CodeLlama-34B",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "LoRA outperforms RAG for code generation on both datasets across all CodeLlama variants",
    390       "evidence": "Figure 7 shows CodeLlama-7B achieves 39.31 EM@10 with LoRA vs 35.17 with RAG (best) vs 29.83 with ICL on Conala; similar pattern holds for CodeAlpacaPy",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "PEFT outperforms full fine-tuning for SLMs, contrasting with NLP findings",
    395       "evidence": "Table 3 shows CodeGen-350M-mono LoRA achieves 25.60 EM@10 on Conala vs 18.42 for full fine-tuning; similar patterns for CodeT5+ variants. Authors note this contrasts with Ding et al.'s NLP finding that full fine-tuning is superior",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Prefix tuning fails to effectively adapt larger LLMs to code generation datasets",
    400       "evidence": "Table 3 shows Prefix tuning yields 0.0 EM@1 and 0.16–0.32 EM@10 on CodeAlpacaPy for CodeGen2-7B, CodeLlama variants, and all models ≥3.7B, while LoRA achieves 7–8% EM@1 on the same models",
    401       "supported": "strong"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "observational"
    407   ],
     408   "key_findings": "PEFT techniques, particularly LoRA, consistently outperform both ICL and RAG for Python code generation across 11 LLMs and SLMs tested under a single 24GB GPU constraint. LLMs fine-tuned with PEFT surpass fully fine-tuned SLMs by 39.8–72.3% in EM@k, and PEFT also beats full fine-tuning for SLMs (contrasting with NLP literature). QLoRA-4bit enables fine-tuning of 34B parameter models within a 24GB GPU while achieving comparable or superior performance to LoRA, and Prefix tuning consistently fails for models above 3.7B parameters. Benchmark contamination from model pre-training data is unaddressed, and no statistical significance tests are applied to any comparative claims.",
    409   "red_flags": [
    410     {
    411       "flag": "No statistical significance tests",
    412       "detail": "All comparative claims ('LoRA significantly enhances', 'consistently outperforms') are made without any statistical tests; single-run point estimates are reported throughout Tables 3-4."
    413     },
    414     {
    415       "flag": "No variance across runs",
    416       "detail": "No standard deviation or multi-run results are reported; fine-tuning with random initialization and dataset sampling introduces variance that is unmeasured."
    417     },
    418     {
    419       "flag": "Benchmark contamination unaddressed",
    420       "detail": "Conala (2018), APPS (2021), and CodeAlpaca (2023) predate the training cutoffs of CodeLlama and CodeGen2; potential test data leakage into model pre-training is never discussed."
    421     },
    422     {
    423       "flag": "ICL baseline potentially weak",
    424       "detail": "ICL uses randomly selected examples rather than retrieval-based selection; prior work cited by the authors shows retrieval-based ICL significantly outperforms random selection, making PEFT vs ICL comparisons potentially inflated."
    425     },
    426     {
    427       "flag": "Python-only evaluation",
    428       "detail": "All experiments use Python code generation only, yet the abstract claims PEFT 'superiority and potential over ICL and RAG across a diverse set of LLMs' without qualifying this limitation up front."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    434       "relevance": "Core PEFT technique evaluated; foundational method for parameter-efficient fine-tuning"
    435     },
    436     {
    437       "title": "QLoRA: Efficient Finetuning of Quantized LLMs",
    438       "relevance": "QLoRA technique combining LoRA with quantization; key method evaluated for memory reduction"
    439     },
    440     {
    441       "title": "Code Llama: Open Foundation Models for Code",
    442       "relevance": "Best-performing LLM family in the study; primary model used for RQ3 and RQ4 analysis"
    443     },
    444     {
    445       "title": "Few-Shot Parameter-Efficient Fine-Tuning is Better and Cheaper than In-Context Learning",
    446       "relevance": "Prior NLP work showing PEFT advantage; this paper extends those findings to code generation with LLMs"
    447     },
    448     {
    449       "title": "Delta Tuning: A Comprehensive Study of Parameter Efficient Methods for Pre-Trained Language Models",
    450       "relevance": "Large-scale NLP comparison showing full FT > PEFT; this paper's SE findings contrast with these results"
    451     },
    452     {
    453       "title": "Measuring Coding Challenge Competence With APPS",
    454       "relevance": "Execution-based benchmark used for RQ4; provides difficulty-stratified evaluation of code generation"
    455     },
    456     {
    457       "title": "CodeT5+: Open Code Large Language Models for Code Understanding and Generation",
    458       "relevance": "SLM and LLM family evaluated in the study; prior work on code-specific pre-training"
    459     },
    460     {
    461       "title": "Docprompting: Generating Code by Retrieving the Docs",
    462       "relevance": "RAG baseline approach for code generation; directly compared against PEFT in RQ3"
    463     }
    464   ],
    465   "engagement_factors": {
    466     "practical_relevance": {
    467       "score": 3,
    468       "justification": "Directly addresses the real constraint of single-GPU fine-tuning, with specific memory numbers and code released for practitioners to reproduce."
    469     },
    470     "surprise_contrarian": {
    471       "score": 2,
    472       "justification": "PEFT beating full fine-tuning for SLMs contrasts with NLP literature findings, and QLoRA-4bit outperforming LoRA is counterintuitive (lower precision = better)."
    473     },
    474     "fear_safety": {
    475       "score": 0,
    476       "justification": "No AI safety or risk concerns raised; purely a methods comparison paper."
    477     },
    478     "drama_conflict": {
    479       "score": 0,
    480       "justification": "Straightforward empirical comparison with no controversy or competing claims."
    481     },
    482     "demo_ability": {
    483       "score": 2,
    484       "justification": "Code is publicly available at GitHub and all models are open-source; practitioners can reproduce results on a single consumer GPU."
    485     },
    486     "brand_recognition": {
    487       "score": 1,
    488       "justification": "University of Montreal and Singapore Management University are solid academic institutions but not top-tier AI labs; no industry co-authorship."
    489     }
    490   },
    491   "hn_data": {
    492     "threads": [
    493       {
    494         "hn_id": "32632312",
    495         "title": "Exploring the Role of the Cybercrime Underground in the Russia-Ukraine Conflict",
    496         "points": 4,
    497         "comments": 0,
    498         "url": "https://news.ycombinator.com/item?id=32632312",
    499         "created_at": "2022-08-28T21:36:55Z"
    500       },
    501       {
    502         "hn_id": "35662520",
    503         "title": "Learning to Program with Natural Language",
    504         "points": 3,
    505         "comments": 2,
    506         "url": "https://news.ycombinator.com/item?id=35662520",
    507         "created_at": "2023-04-22T01:45:40Z"
    508       },
    509       {
    510         "hn_id": "37866902",
    511         "title": "Getting Bored of Cyberwar",
    512         "points": 3,
    513         "comments": 1,
    514         "url": "https://news.ycombinator.com/item?id=37866902",
    515         "created_at": "2023-10-13T05:03:06Z"
    516       },
    517       {
    518         "hn_id": "37232173",
    519         "title": "GPT-NER: Named Entity Recognition via Large Language Models",
    520         "points": 3,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=37232173",
    523         "created_at": "2023-08-23T05:23:52Z"
    524       },
    525       {
    526         "hn_id": "37168933",
    527         "title": "Fast as Chita: Neural Network Pruning with Combinatorial Optimization",
    528         "points": 2,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=37168933",
    531         "created_at": "2023-08-17T22:16:16Z"
    532       },
    533       {
    534         "hn_id": "35984221",
    535         "title": "SLiC-HF: Sequence Likelihood Calibration with Human Feedback",
    536         "points": 2,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=35984221",
    539         "created_at": "2023-05-18T04:48:32Z"
    540       },
    541       {
    542         "hn_id": "35263649",
    543         "title": "A comprehensive capacity analysis of GPT-3 and GPT-3.5 models",
    544         "points": 2,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=35263649",
    547         "created_at": "2023-03-22T16:39:00Z"
    548       },
    549       {
    550         "hn_id": "37232871",
    551         "title": "Vanilla Transformer SOTA for Traffic Forecasting [pdf]",
    552         "points": 1,
    553         "comments": 0,
    554         "url": "https://news.ycombinator.com/item?id=37232871",
    555         "created_at": "2023-08-23T07:33:46Z"
    556       },
    557       {
    558         "hn_id": "37958375",
    559         "title": "Revealing the structure of language model capabilities",
    560         "points": 1,
    561         "comments": 0,
    562         "url": "https://news.ycombinator.com/item?id=37958375",
    563         "created_at": "2023-10-20T16:40:14Z"
    564       },
    565       {
    566         "hn_id": "35670419",
    567         "title": "Fully Autonomous Programming with Large Language Models",
    568         "points": 1,
    569         "comments": 0,
    570         "url": "https://news.ycombinator.com/item?id=35670419",
    571         "created_at": "2023-04-22T20:05:33Z"
    572       }
    573     ],
    574     "top_points": 4,
    575     "total_points": 22,
    576     "total_comments": 3
    577   }
    578 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs