scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33318B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An efficient strategy for fine-tuning large language models",
      6     "authors": [
      7       "B. Marsh",
      8       "Adam Michaleas",
      9       "Darrell O. Ricke",
     10       "Shaun Monera",
     11       "Shriya Zembruski"
     12     ],
     13     "year": 2026,
     14     "venue": "Frontiers in Artificial Intelligence",
     15     "arxiv_id": null,
     16     "doi": "10.3389/frai.2026.1665992"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims are supported: DSS + full-precision strongest (Table 3, loss 0.06384), LoRA provides performance-efficiency tradeoff (Table 3, Section 5.1), QLoRA enables larger models under memory constraints (only method to run FLAN-T5 XL), 4:1 alpha-to-rank ratio (Figure 7, Section 5.2).",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The primary causal claim — that DSS rationales improve training — is supported by a controlled ablation (Section 3.7, Table 4) that manipulates only α (0.5 vs 1.0) while holding all other hyperparameters constant. This single-variable manipulation is adequate for causal inference within the tested setting.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title claims 'fine-tuning large language models' broadly, and the abstract proposes 'a general guide for efficiently fine-tuning LLMs for domain-specific tasks.' However, the study tests only one task (NL to Query DSL) with one model family (FLAN-T5 encoder-decoder). The Limitations section acknowledges this but the title and abstract still overclaim.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 4 discusses alternative explanations for the counter-intuitive memory findings: additional adapter overhead, dequantization costs during forward/backward passes, and 'implementation differences between the full-precision and LoRA/QLoRA methods' (using standard PyTorch vs PEFT/bitsandbytes libraries).",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Section 5.5 explicitly acknowledges: 'the metrics do not directly capture task-level correctness, such as exact match rates on the DSL JSON' and recommends 'future work should incorporate additional metrics, such as BLEU, METEOR, and TER scores.' This directly addresses the gap between measured proxy (token-level loss) and claimed outcome (task effectiveness).",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 5.5 is titled 'Limitations' and contains two substantial paragraphs discussing multiple specific limitations of the study.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 5.5 lists specific threats: single downstream task (NL to Query DSL), token-level loss without task-level metrics, FLAN-T5 only without decoder-only architectures, limited random seeds, and incomplete hyperparameter exploration due to compute constraints.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 5.5 explicitly states: 'conclusions may not directly transfer to other domains, such as open-ended text generation, conversational dialogue, or classification tasks,' and 'the methodology focuses on the FLAN-T5 encoder-decoder family and does not include decoder-only architectures.'",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding section states: 'This material is based upon work supported by the Department of the Air Force under Air Force Contract No. FA8702-15-D-0001.'",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated: Marine Corps Tactical Systems Support Activity (USMC) and MIT Lincoln Laboratory, Artificial Intelligence Technology. No conflict with evaluated products exists.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The funder (US Department of the Air Force) has no commercial interest in any of the evaluated methods or models (FLAN-T5, LoRA, QLoRA are open-source methods from academic/industry research). The funder benefits from knowing which method works, not from a specific outcome.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The Conflict of Interest section states: 'The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.'",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "DSS, LoRA, QLoRA, and full-precision fine-tuning are all defined with technical detail including mathematical formulations (Equations 1–3).",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper clearly states its contribution: 'an end-to-end strategy for rapidly fine-tuning LLMs for domain-specific tasks when both data and compute are limited,' operationalized as DSS + benchmarked fine-tuning methods.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 covers domain adaptation, PEFT methods, DSS/CoT, and NL-to-SQL work; Section 2.5 explicitly positions this work relative to Hsieh et al., Hu et al., and Dettmers et al.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "A public GitHub repository is provided: https://github.com/brmarsh23/An-Efficient-Strategy-for-Fine-Tuning-Large-Language-Models. The paper states 'The code and instructions are available at the following Git Repository link' (Section 1).",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The Data Availability Statement explicitly states: 'The datasets presented in this article are not readily available because dataset utilized in the submission is Controlled Unclassified Information (CUI) from US Department of Defense computer information systems.'",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Section 3.5 describes hardware (Intel Xeon Platinum 8480+, NVIDIA H100 GPUs, 2 TB RAM) and names libraries (PyTorch, PEFT, bitsandbytes, Ray Train) but provides no library versions, requirements.txt, or Dockerfile in the paper itself.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper itself contains no step-by-step reproduction instructions. A GitHub URL is provided with claimed 'instructions,' but the dataset is CUI-restricted, making full reproduction impossible regardless of code availability.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Tables 3 and 4 report point estimates of evaluation loss with no confidence intervals, error bars, or uncertainty measures. The ablation (Table 4) is averaged over 2 seeds but reports no CI or spread.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims full-precision 'yields the strongest overall performance' and DSS 'consistently improves model fine-tuning' but provides no statistical significance tests (no p-values, t-tests, or bootstrap tests) to support any comparative claims.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "Table 4 shows raw loss differences between DSS and label-only training (e.g., +1.4e-3) but no standardized or relative effect sizes (Cohen's d, percentage improvement, or relative differences). Main results in Table 3 are raw loss values with no effect size framing.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The dataset consists of 1,000 questions with no justification for this sample size, no power analysis, and no discussion of whether 1,000 examples is sufficient for the claims being made.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Main results (Table 3) are single-run best evaluation losses. The ablation (Table 4) is 'averaged over two random seeds' but reports no standard deviation, IQR, or spread measure. The reader cannot assess result stability.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Three fine-tuning methods are systematically compared: full-precision, LoRA, and QLoRA. The ablation additionally compares DSS (α=0.5) against label-only (α=1.0) training.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "LoRA (Hu et al., 2021) and QLoRA (Dettmers et al., 2023) are the current standard parameter-efficient fine-tuning methods. DSS (Hsieh et al., 2023) is recent. All are appropriate contemporary methods.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 3.7 describes a dedicated ablation study comparing DSS rationale-augmented training (α=0.5) versus label-only training (α=1.0) 'across model sizes and fine-tuning methods.' Results in Table 4.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The paper reports evaluation loss for task performance, plus GPU memory usage, training samples per second, and total training time as efficiency metrics (Figure 6, Table 3). Multiple dimensions of comparison are provided.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation of model outputs is performed. The paper fine-tunes models for NL-to-Query DSL translation but never has humans assess whether the generated DSL queries are correct, usable, or semantically appropriate.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "The 80/20 split produces an 'evaluation dataset' that is used both for learning rate reduction decisions ('evaluation loss was monitored in order to conduct learning rate reduction after 10 epochs of no improvement,' Section 3.5) and for final performance reporting. This makes it a validation set, not a held-out test set.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by model architecture (Small/Base/Large/XL), fine-tuning method (full-precision/LoRA/QLoRA), and hyperparameter settings (Rank/Alpha combinations). Figure 6 provides a quad chart; Table 3 lists top 8 models; Figure 7 breaks down by Rank/Alpha.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "No qualitative error analysis of model outputs is provided. The paper does not show examples where the fine-tuned model produced incorrect Query DSL or discuss specific failure modes of the generated outputs.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports the counter-intuitive finding that 'LoRA and QLoRA methods required significantly more memory to train the highest-performing model type, the FLAN-T5 Large, than the full-precision method' (Section 4), contradicting theoretical expectations.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model architectures are identified: FLAN-T5 Small (76.9M), Base (247.6M), Large (770.6M), and XL (2,884.5M) with detailed architecture parameters in Table 1. Teacher model specified as Mixtral 8x22B. These are well-defined open-source models.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 2 shows an example input prompt with its three-part structure: DSL interface instructions, dataset description, and Chain-of-Thought prompting. Figure 3 shows the dataset creation process. Figure 4 shows the training step format with task prefixes.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 2 reports hyperparameters: learning rate (5e-5), patience (10 epochs), LR factor (1e-1), epochs (100), batch size (8), and DSS α (0.5). Section 3.6 details LoRA/QLoRA-specific parameters: Rank values (32, 64, 128), LoRA Alpha values, target modules, and dropout rate.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The work involves standard model fine-tuning with DSS, not agentic workflows.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Sections 3.1-3.2 describe the data pipeline: 1,000 NL questions processed through Mixtral 8x22B with chain-of-thought prompting to generate labels and rationales, then formatted as multi-task training data with task prefixes, and split 80/20 for training and evaluation.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The dataset is classified as Controlled Unclassified Information (CUI) and 'not readily available.' Requests must be directed to benjamin.marsh@usmc.mil. Independent verification of the underlying data is not possible.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 3.2 describes: 1,000 NL questions designed for querying organizational data, translated to Query DSL using Mixtral 8x22B as teacher model via chain-of-thought prompting. The prompt structure is shown in Figure 2.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "The origin of the 1,000 input natural language questions is never described. Were they hand-crafted, sampled from query logs, or synthetically generated? The paper jumps from describing the task to describing the teacher model output without explaining where the input questions came from.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline from input questions through teacher-model generation to fine-tuning format is documented across Sections 3.1-3.2 and Figures 3-4. The paper mentions 'all training examples were screened for correctness and validity' (Section 5.5), though screening criteria are not detailed.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. Models are fine-tuned on a custom dataset (NL to Query DSL) created by the authors; contamination from pre-training data is not a relevant concern for this evaluation setup.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "Not applicable for the same reason as training_cutoff_stated: the evaluation dataset was generated by the authors specifically for this study, not drawn from a public benchmark. There is no pre-trained model benchmark evaluation scenario.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "No public benchmark is used. The evaluation is on a custom CUI dataset, so standard benchmark contamination concerns do not apply.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants. This is a model fine-tuning study with automated evaluation.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in the study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in the study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in the study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in the study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in the study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in the study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or per-example cost is reported. The paper focuses entirely on training costs. How much it costs to run the fine-tuned models at inference time is not discussed.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Total compute time is stated: 'The total compute time for the hyperparameter search was 499.6 hours' (Section 3.6). Hardware is detailed: two-node cluster with four NVIDIA H100 80GB GPUs per node. Per-run training times are in Table 3.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "Main hyperparameter search results (Table 3) are single-run best evaluation losses with no seed variation. The ablation (Table 4) averages over only 2 random seeds with no spread reported. The paper acknowledges 'ablation results are averaged over a limited number of random seeds' (Section 5.5).",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": true,
    380           "justification": "Section 3.6 explicitly states: '3 runs were performed using full-precision fine-tuning, 39 runs were performed with the LoRA fine-tuning method, and 44 runs with the QLoRA fine-tuning method.' Total of 86 hyperparameter sweeps. Ablation states 'averaged over two random seeds.'",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": true,
    386           "justification": "Comprehensive search budget reported: 86 total sweeps, 499.6 compute hours, with explicit enumeration of Rank values (32, 64, 128), Alpha values per Rank, and per-architecture/per-method run counts (Section 3.6).",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": true,
    392           "justification": "Section 3.8 states: 'the best evaluation loss achieved for each hyperparameter sweep was used to perform the final comparison.' Selection criterion is clear and all top 8 configurations are reported in Table 3 rather than just the single best.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "86 hyperparameter configurations are compared with no correction for multiple comparisons. Claims of 'best' and 'consistent' patterns are made without adjusting for the number of comparisons.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The paper does not discuss author-evaluation bias. While the authors use standard library implementations rather than custom re-implementations, they do not acknowledge that implementation choices (e.g., the differing optimizations in standard PyTorch versus PEFT/bitsandbytes) could systematically advantage one method.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": true,
    410           "justification": "Figure 6 explicitly plots performance (evaluation loss) alongside GPU memory usage, training samples per second, and total training time for each method and model size. Table 3 also pairs loss with GPU usage and training time. This is a central contribution of the paper.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": true,
    416           "justification": "Section 5.5 explicitly discusses construct validity: 'the metrics do not directly capture task-level correctness, such as exact match rates on the DSL JSON' and recommends additional metrics (BLEU, METEOR, TER). This directly questions whether the measured metric (token loss) captures what matters (task correctness).",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding is involved. The study compares fine-tuning methods directly, not scaffold-dependent systems.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "No discussion of whether the FLAN-T5 pre-training data or Mixtral 8x22B training data could contain Query DSL patterns or similar structured generation examples that would advantage the models on this task.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the DSS multi-task training setup (where models learn to generate both rationales and labels) could leak information between tasks at evaluation time, or whether the teacher-generated labels could embed patterns from the teacher's training data.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "The 80/20 train-eval split is applied to 1,000 questions without discussion of whether train and eval examples share structural patterns (e.g., similar question templates, overlapping Query DSL structures) that would inflate performance estimates.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No leakage detection or prevention method is used. No overlap analysis, deduplication, or independence verification between train and evaluation splits.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "DSS combined with full-precision fine-tuning achieves the best evaluation loss (0.06384) among all evaluated configurations",
    457       "evidence": "Table 3 ranks the top 8 models by evaluation loss, with FLAN-T5 Large full-precision first; Figure 6 shows full-precision achieving the lowest average loss at every model size on which it was run (FLAN-T5 XL could only be trained with QLoRA)",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "Alpha-to-rank ratio of 4:1 consistently provides optimal performance for LoRA and QLoRA fine-tuning",
    462       "evidence": "Figure 7 shows peak average performance at the highest alpha values (4x rank), and the majority of top models in Table 3 show 4:1 or higher ratios",
    463       "supported": "moderate"
    464     },
    465     {
    466       "claim": "DSS rationale supervision improves performance over label-only training in all 8 ablation configurations",
    467       "evidence": "Table 4 shows lower evaluation loss for DSS training (α=0.5) than for label-only training (α=1.0) across all combinations of model size and fine-tuning method",
    468       "supported": "moderate"
    469     },
    470     {
    471       "claim": "LoRA and QLoRA methods can require more GPU memory than full-precision fine-tuning for larger models (FLAN-T5 Large)",
    472       "evidence": "Figure 6 bottom-left shows LoRA and QLoRA averaging higher GPU memory than full-precision for FLAN-T5 Large, attributed to adapter overhead and dequantization requirements",
    473       "supported": "strong"
    474     },
    475     {
    476       "claim": "QLoRA enables fine-tuning of FLAN-T5 XL within available memory constraints when full-precision and LoRA cannot",
    477       "evidence": "All 5 FLAN-T5 XL runs used QLoRA due to GPU memory limitations; full-precision and LoRA failed on this architecture",
    478       "supported": "strong"
    479     },
    480     {
    481       "claim": "DSS rationale benefit is largest for smaller, more constrained models",
    482       "evidence": "Table 4 shows the largest loss improvement for FLAN-T5 Small QLoRA (+1.6e-2) and smallest for FLAN-T5 Base LoRA (+2.5e-4); Section 5.3 attributes this to structured supervision compensating for limited model capacity",
    483       "supported": "moderate"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval"
    488   ],
    489   "key_findings": "This paper proposes and evaluates an end-to-end LLM fine-tuning strategy combining Distilling Step-by-Step (DSS) dataset generation with three fine-tuning modalities on an NL-to-Query-DSL task. Full-precision fine-tuning with DSS achieves the best performance (loss 0.06384) but is memory-limited for large architectures. LoRA and QLoRA provide competitive alternatives at different resource tradeoffs, with QLoRA being the only method viable for FLAN-T5 XL. A 4:1 alpha-to-rank ratio consistently yields the best performance for parameter-efficient methods, and DSS rationale supervision improves over label-only training in all 8 ablation conditions — with the largest benefit for smaller, more constrained models.",
    490   "red_flags": [
    491     {
    492       "flag": "CUI data unavailable",
    493       "detail": "The fine-tuning dataset is Controlled Unclassified Information from DoD systems and cannot be accessed by outside researchers, making reproduction impossible despite code being released."
    494     },
    495     {
    496       "flag": "Evaluation loss as sole performance metric",
    497       "detail": "All performance comparisons rely on token-level cross-entropy loss with no task-level correctness metrics (exact match, BLEU, METEOR, TER); the paper itself acknowledges this as a key limitation."
    498     },
    499     {
    500       "flag": "Only two random seeds for ablation",
    501       "detail": "Ablation results are averaged over just 2 random seeds with no variance reported, providing insufficient characterization of result stability."
    502     },
    503     {
    504       "flag": "Origin of 1000 input questions undocumented",
    505       "detail": "The paper never documents how the 1000 natural language seed questions were created or collected, leaving the dataset provenance unclear."
    506     },
    507     {
    508       "flag": "Single task domain extrapolation",
    509       "detail": "All results derive from one NL-to-Query-DSL task in a specific DoD context; actionable recommendations (4:1 ratio, method selection guide) are generalized beyond this narrow evidence base."
    510     },
    511     {
    512       "flag": "No statistical significance testing",
    513       "detail": "Comparisons between fine-tuning methods are made by direct numerical comparison of loss values without hypothesis testing, leaving it unclear whether differences are statistically meaningful."
    514     }
    515   ],
    516   "cited_papers": [
    517     {
    518       "title": "Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes",
    519       "relevance": "Core method used for dataset generation; paper extends DSS to FLAN-T5 with PEFT methods"
    520     },
    521     {
    522       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    523       "relevance": "One of the three fine-tuning methods benchmarked; foundational PEFT approach"
    524     },
    525     {
    526       "title": "QLoRA: Efficient Finetuning of Quantized LLMs",
    527       "relevance": "Second PEFT method benchmarked; enables fine-tuning of the largest model under memory constraints"
    528     },
    529     {
    530       "title": "Finetuned Language Models are Zero-Shot Learners (FLAN/FLAN-T5)",
    531       "relevance": "Student model architecture used throughout all experiments"
    532     },
    533     {
    534       "title": "Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer (T5)",
    535       "relevance": "Foundation architecture for FLAN-T5 models used in experiments"
    536     },
    537     {
    538       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    539       "relevance": "Technique used to elicit rationales from teacher model in the DSS pipeline"
    540     },
    541     {
    542       "title": "Towards a Unified View of Parameter-Efficient Transfer Learning",
    543       "relevance": "Survey contextualizing PEFT methods and explaining unexpected memory behavior for larger models"
    544     },
    545     {
    546       "title": "Parameter-Efficient Fine-Tuning for Large Models: A Comprehensive Survey",
    547       "relevance": "Background survey on PEFT landscape and performance-efficiency tradeoffs"
    548     }
    549   ],
    550   "engagement_factors": {
    551     "practical_relevance": {
    552       "score": 3,
    553       "justification": "Directly addresses resource-constrained LLM deployment with an actionable decision framework (method selection guide, 4:1 ratio recommendation) and released code."
    554     },
    555     "surprise_contrarian": {
    556       "score": 1,
    557       "justification": "The finding that LoRA/QLoRA consume MORE memory than full-precision for FLAN-T5 Large contradicts the standard narrative of PEFT methods as uniformly memory-efficient."
    558     },
    559     "fear_safety": {
    560       "score": 0,
    561       "justification": "No safety or risk implications; purely about fine-tuning efficiency for domain adaptation."
    562     },
    563     "drama_conflict": {
    564       "score": 0,
    565       "justification": "No controversy or competing claims; paper presents results straightforwardly within established literature."
    566     },
    567     "demo_ability": {
    568       "score": 2,
    569       "justification": "Code is released on GitHub and can be run on suitable hardware, though the original DoD dataset cannot be replicated."
    570     },
    571     "brand_recognition": {
    572       "score": 1,
    573       "justification": "MIT Lincoln Laboratory affiliation adds credibility; US Marines/DoD angle is distinctive but not a mainstream AI lab brand."
    574     }
    575   },
    576   "hn_data": {
    577     "threads": [],
    578     "top_points": 0,
    579     "total_points": 0,
    580     "total_comments": 0
    581   }
    582 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs