scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24315B)
      1 {
      2   "paper": {
      3     "title": "Instruction Tuning of Large Language Models for Tabular Data Generation—in One Day",
      4     "authors": [
      5       "Milad Abdollahzadeh",
      6       "Abdul Raheem",
      7       "Zilong Zhao",
      8       "Uzair Javaid",
      9       "Kevin Yee",
     10       "Nalam Venkata Abhishek",
     11       "Tram Truong-Huu",
     12       "Biplab Sikdar"
     13     ],
     14     "year": 2025,
     15     "venue": "ICML 2025 (PMLR 267)",
     16     "arxiv_id": "2511.23220",
     17     "doi": "10.48550/arXiv.2511.23220"
     18   },
     19   "scan_version": 2,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "Instruction tuning Llama3.1-8B-Instruct on a small (7K) but high-quality instruction dataset for tabular data generation, using a single A100 GPU for under 6 hours, yields fidelity and utility metrics competitive with GPT-4o across 20 datasets. The base LLM without instruction tuning largely fails to follow tabular generation instructions (~80% of output is irrelevant). The approach generalizes to another base LLM (TableLlama) in supplementary experiments.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No repository URL, code archive, or link to released code is provided anywhere in the paper."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The source datasets are publicly available, but the authors' constructed instruction dataset (10K instructions with metadata) is not released. No download link is provided."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper mentions A100 80GB GPU, HuggingFace Transformers, and DeepSpeed ZeRO-2, but provides no requirements.txt, library versions, or environment specification sufficient to recreate the setup."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method description in Sections 4-5 gives high-level steps but not enough detail to replicate without guessing."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Tables 1 and 2 report only point estimates for Shape, Trends, AUC, and R2. No confidence intervals, error bars, or ± notation appear anywhere."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims performance 'on par with GPT-4o' based solely on comparing numbers in tables without any statistical test."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Tables 1-2 report absolute metric values for base LLM, ITT-GEN, and GPT-4o, allowing the reader to assess the magnitude of improvement (e.g., breast cancer Shape: 55.31 → 84.12 → 78.65)."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification for the choice of 20 datasets, 500 training instances per dataset, N=20 rows per instance, or any other sample size decision."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No variance, standard deviation, or spread measure is reported across experimental runs. Results appear to be single-run."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper compares against two baselines: the base LLM (Llama3.1-8B-Instruct) and GPT-4o (Section 5.1)."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "GPT-4o is a contemporary and capable commercial LLM. The comparison is reasonable for the time of writing."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No ablation study is performed. The system includes metadata design, instruction format, and fine-tuning, but no experiment removes individual components (e.g., effect of metadata, effect of instruction format, effect of dataset size)."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The paper uses fidelity metrics (Shape, Trends) and utility metrics (AUC, R2 via TSTR framework with three ML models) as described in Section 5.1."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation of generated tabular data quality. All evaluation is automated via fidelity and utility metrics."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "6 of 20 datasets are held out as unseen out-of-domain (OoD) evaluation datasets, separate from the 14 training datasets (Section 4.1, Table 3)."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Tables 1 and 2 provide per-dataset results across all 20 datasets rather than just aggregate averages."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "The paper discusses base LLM failure (generating irrelevant text, Figure 1) but does not analyze when or why ITT-GEN itself fails. Some datasets show notably poor performance (e.g., job posting, Players2024) without discussion."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "Every experiment shows ITT-GEN performing well. No failed approaches, abandoned configurations, or ablations that hurt performance are reported."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The abstract claims 'performance on par with GPT-4o.' However, Tables 1-2 show GPT-4o substantially outperforms ITT-GEN on many datasets (e.g., adult Shape 85.73 vs 92.34; california housing Shape 73.29 vs 96.27; bank utility 0.616 vs 0.820). 'On par' overstates the results."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The main causal claim — that instruction tuning improves tabular data generation — is supported by comparing the same base model before and after fine-tuning, which is a controlled single-variable manipulation."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title says 'Large Language Models' (plural) but the main experiments test only Llama3.1-8B-Instruct. TableLlama results in the appendix use a weaker, older model. Claims of general LLM applicability are not bounded to the tested setting."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "No alternative explanations are discussed. For example, the improvement could partly come from the model memorizing the 20 dataset formats rather than learning general tabular generation, or the metadata could be doing most of the work."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures Shape (distributional similarity), Trends (correlation preservation), and TSTR utility, and frames these as tabular data generation quality metrics. The claims match the granularity of the measurements without overclaiming to a broader construct."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Llama3.1-8B-Instruct is specific, but GPT-4o is used without a snapshot date or API version. Per schema, marketing names without snapshot dates do not count."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Figure 2 shows a complete example instruction including the full prompt structure with metadata, input table, question, and expected answer. Figure 3 shows the GPT-4o prompt template for metadata generation."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 5.1 reports the fine-tuning hyperparameters: learning rate 2e-5, batch size 3, and 2 epochs, trained on an A100 80GB GPU with DeepSpeed ZeRO-2."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. The approach is straightforward fine-tuning and inference."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 4.1 documents the dataset construction pipeline: 20 public datasets sampled across 10 topics, 14/6 train/eval split, 500/100 instances per dataset, N=20 rows per instance, metadata generated via GPT-4o with manual review."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "Section 1 has a 'Limitations' paragraph about the research gap (high data/compute requirements), not about limitations of the authors' own method. No dedicated limitations section discusses the shortcomings of ITT-GEN."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No threats to validity are discussed. No acknowledgment of potential issues with the evaluation, dataset selection, or generalizability of results."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "No explicit scope boundaries are stated. The paper does not discuss what settings, models, or data types the results do not apply to."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The constructed instruction dataset is not released. Source datasets are public, but the processed instruction data and generated outputs are not available for verification."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 4.1 describes data collection: 20 public datasets from 10 topics, instruction construction procedure, metadata generation via GPT-4o, and manual review process."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. Data sources are standard public datasets listed in Table 3."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The pipeline from dataset selection through instruction creation to training is documented in Section 4.1: dataset sampling → metadata generation → instruction construction → train/eval split → mixing and shuffling."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding or acknowledgments section is present in the paper."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are listed: Singapore Institute of Technology, Betterdata AI (a synthetic data company), and National University of Singapore."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding information is disclosed, making independence unverifiable. Betterdata AI is a synthetic data company with direct commercial interest in the outcome."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial disclosure statement. Multiple authors are from Betterdata AI, a company whose product is synthetic data generation — directly relevant to the paper's findings."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No training data cutoff is stated for GPT-4o or Llama3.1-8B-Instruct. The public datasets used (e.g., adult, iris, boston housing) are decades old and almost certainly in the training data of both models."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No discussion of whether the public evaluation datasets (or similar tables) appeared in the pre-training data of GPT-4o or Llama3.1."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "Classic ML datasets like iris, adult, boston housing, and diabetes have been on the internet for decades and are almost certainly in LLM training data. This is not discussed."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No inference cost, latency, or tokens consumed per generation is reported. GPT-4o API costs for the comparison are not mentioned."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Section 5.1 states training used 'an A100 80GB GPU for 2 epochs' and the abstract/conclusion state 'less than 6 hours' of training time."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from a single training run."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The number of experimental runs is not stated. It is unclear whether results are from one run or averaged over multiple."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No hyperparameter search is described. The chosen hyperparameters (lr=2e-5, batch=3, epochs=2) appear without justification or search budget."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "No justification for why the reported configuration was selected. No mention of validation-based selection or alternatives tried."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors evaluate their own system (ITT-GEN) against baselines without acknowledging author-evaluation bias."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "ITT-GEN uses a single A100 for under 6 hours while GPT-4o is a massively larger model. This compute disparity is framed as a positive ('limited resources') but performance as a function of compute is not analyzed."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "No discussion of whether Shape, Trends, and TSTR metrics actually capture the quality of tabular data generation. The metrics are adopted from prior work without questioning their validity."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No scaffolding is involved in the approach. Models are evaluated directly."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Not discussed. Classic datasets like iris (1936), adult (1994), boston housing (1978) have been on the internet for decades and are almost certainly in the training data of both LLMs."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Not discussed. The evaluation provides 20 input rows and asks for 20 generated rows — it is unclear whether models could be memorizing and regurgitating known rows."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "Not discussed. Training and evaluation instances are constructed from the same datasets (different row samples), and the public datasets may be in the LLMs' pre-training data."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention method is applied."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Instruction tuning on 7K instructions with a single A100 GPU for less than 6 hours yields tabular data generation performance on par with GPT-4o.",
    374       "evidence": "Tables 1-2 show fidelity (Shape, Trends) and utility (AUC, R2) metrics. ITT-GEN is competitive on some datasets but substantially lower on others (e.g., california housing Shape 73.29 vs 96.27, bank utility 0.616 vs 0.820).",
    375       "supported": "weak"
    376     },
    377     {
    378       "claim": "The base LLM (Llama3.1-8B-Instruct) fails to follow tabular data generation instructions without instruction tuning.",
    379       "evidence": "Figure 1 shows the base LLM generating irrelevant text instructions instead of tabular data. Section 5.2 notes ~80% of base LLM output is non-tabular.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "The approach is model-agnostic and also improves TableLlama's tabular data generation performance.",
    384       "evidence": "Supplementary Tables 4-5 show TableLlama base cannot generate structured output at all, while ITT-GEN fine-tuned TableLlama produces usable tabular data, though with a large gap to GPT-4o.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "This is the first work to explore instruction tuning for tabular data generation.",
    389       "evidence": "Section 1 and related work (Section 2) state that prior tabular instruction tuning focused on QA and reasoning tasks only.",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Overclaiming: 'on par with GPT-4o'",
    396       "detail": "Tables 1-2 show GPT-4o substantially outperforms ITT-GEN on many datasets. For example, california housing Shape (73.29 vs 96.27), adult Shape (85.73 vs 92.34), bank utility (0.616 vs 0.820). Claiming 'on par' is generous."
    397     },
    398     {
    399       "flag": "No error bars or variance across runs",
    400       "detail": "All results appear to be single-run point estimates with no uncertainty quantification. Given that fine-tuning with different random seeds can produce meaningfully different results, this is a significant omission."
    401     },
    402     {
    403       "flag": "Undisclosed conflict of interest",
    404       "detail": "Multiple authors (4 of 8) are from Betterdata AI, a commercial synthetic data company. The paper demonstrates that limited compute can produce competitive synthetic tabular data — directly relevant to the company's business. No competing interests statement is provided."
    405     },
    406     {
    407       "flag": "Contamination risk with classic datasets",
    408       "detail": "Evaluation uses well-known datasets (iris, adult, boston housing, diabetes) that are ubiquitous on the internet and almost certainly in both LLMs' training data. GPT-4o may have memorized these datasets. This is not discussed."
    409     },
    410     {
    411       "flag": "No ablation study",
    412       "detail": "The instruction dataset includes carefully designed metadata, but no ablation measures the contribution of metadata vs. instruction format vs. dataset diversity. It is unclear what drives the improvement."
    413     },
    414     {
    415       "flag": "Base LLM comparison is unfair",
    416       "detail": "Fidelity metrics for the base LLM are computed on only ~20% of its output (the portion that happens to be tabular). This makes the base LLM look more competitive on fidelity than it actually is and masks its real failure mode: most of its output is not tabular at all."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "TableLlama: Towards Open Large Generalist Models for Tables",
    422       "authors": ["Tianshu Zhang", "Xiang Yue", "Yifei Li", "Huan Sun"],
    423       "year": 2024,
    424       "relevance": "State-of-the-art open-source LLM for table understanding tasks, used as an alternative base model for instruction tuning in this paper's supplementary experiments."
    425     },
    426     {
    427       "title": "TableLLM: Enabling Tabular Data Manipulation by LLMs in Real Office Usage Scenarios",
    428       "authors": ["Xiaojuan Zhang", "Shuo Luo", "Bowen Zhang"],
    429       "year": 2024,
    430       "arxiv_id": "2403.19318",
    431       "relevance": "Tabular instruction tuning for LLMs on operations like QA and Pandas code generation."
    432     },
    433     {
    434       "title": "Rethinking Table Instruction Tuning",
    435       "authors": ["Naihao Deng", "Rada Mihalcea"],
    436       "year": 2025,
    437       "arxiv_id": "2501.14693",
    438       "relevance": "Analyzes hyperparameter selection impact on efficient tabular instruction tuning."
    439     },
    440     {
    441       "title": "HARMONIC: Harnessing LLMs for Tabular Data Synthesis and Privacy Protection",
    442       "authors": ["Yuxin Wang"],
    443       "year": 2024,
    444       "arxiv_id": "2408.02927",
    445       "relevance": "LLM-based tabular data generation via text conversion and fine-tuning."
    446     },
    447     {
    448       "title": "The Llama 3 Herd of Models",
    449       "authors": ["Aaron Grattafiori"],
    450       "year": 2024,
    451       "arxiv_id": "2407.21783",
    452       "relevance": "Base model family used in this study (Llama3.1-8B-Instruct)."
    453     },
    454     {
    455       "title": "GPT-4o System Card",
    456       "authors": ["Aaron Hurst"],
    457       "year": 2024,
    458       "arxiv_id": "2410.21276",
    459       "relevance": "Commercial LLM used as primary comparison baseline."
    460     },
    461     {
    462       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    463       "authors": ["Rafael Rafailov"],
    464       "year": 2023,
    465       "relevance": "DPO method used in post-training of the base Llama model."
    466     },
    467     {
    468       "title": "Instruction Tuning for Large Language Models: A Survey",
    469       "authors": ["Shengyu Zhang"],
    470       "year": 2023,
    471       "arxiv_id": "2308.10792",
    472       "relevance": "Survey of instruction tuning methods for LLMs."
    473     },
    474     {
    475       "title": "Finetuned Language Models Are Zero-Shot Learners",
    476       "authors": ["Jason Wei"],
    477       "year": 2022,
    478       "arxiv_id": "2109.01652",
    479       "relevance": "Foundational work on instruction tuning improving LLM task generalization."
    480     }
    481   ]
    482 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs