scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25680B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Inference time LLM alignment in single and multidomain preference spectrum",
      6     "authors": [
      7       "Sadat Shahriar",
      8       "Zheng Qi",
      9       "Nikolaos Pappas",
     10       "Srikanth Doss Kadarundalagi Raghuram Doss",
     11       "Monica Sunkara"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2410.19206",
     16     "doi": "10.48550/arXiv.2410.19206"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract's 'reducing inference cost by half' is argued theoretically from prompt token doubling but never empirically measured; the '12x faster' claim rests on rough per-job time estimates rather than systematic benchmarking.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The systematic lambda sweep (varying from -1.4 to 1.0 in 0.1 increments) with isolated single-variable control provides adequate support for causal claims about AV's effect on proficiency levels.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper title and abstract frame this as general 'LLM alignment' but all experiments use only Mistral-7B-Instruct-v0.3; the limitations section acknowledges this but the main claims are not consistently bounded to single-architecture settings.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider alternative explanations for why AVs generalize across domains (e.g., capturing response length/verbosity rather than domain expertise), nor does it explain why 5/27 multidomain combinations fail.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The primary metric 'preference accuracy' is a log-probability comparison, not actual user preference; GPT-4 judged accuracy is also a proxy. The paper does not discuss whether these proxies validly represent real-world alignment quality.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 'Limitations and Future Work' is a dedicated section listing four specific limitations with directions for future research.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Limitations are specific: method works only for same-architecture LLMs, tested only on Mistral-7b, requires extensive grid search for multidomain, and uses basic AV extraction without parameter thresholding or SVD.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 7 explicitly states 'our method works only for LLMs with the same architecture' and 'we tested our approach only on Mistral-7b,' providing clear scope boundaries.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment section is present; work is described as done during an internship at AWS AI Labs but no formal funding or grant disclosure appears.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed in the header: University of Houston and AWS AI Labs.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Most authors are AWS AI Labs (Amazon) employees evaluating a method they developed; the employer/funder has a direct interest in the outcome being positive.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Alignment Vectors are defined with equation 1, preference accuracy is defined in Section 5.1, and the three proficiency levels (Avd/Gen/Exp) are defined with clear instructions in Appendix C.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The contributions section explicitly enumerates three contributions: the inference-time AV method, the synthetic 38K query-response dataset, and the multidomain coefficient-based alignment approach.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 compares the approach against ITI, SRV, DeAl, and controllability methods, explaining specific differences in flexibility, reward model dependency, and inference-time applicability.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository link or release is mentioned anywhere in the paper.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The paper states 'The dataset will be available through this link' with no actual URL — this is a promise of future release, not an actual release.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions NVIDIA A100 GPUs and model version but provides no requirements.txt, Dockerfile, or dependency list.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided; the methodology description is conceptual and mathematical, not operational.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 1-5 and Figures 2-4 are point estimates with no confidence intervals or error bars.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparative results despite multiple comparative claims across baselines and conditions.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute accuracy differences between conditions are reported throughout (e.g., legal expert preference accuracy 1.0 vs. 0.79 for AV vs. prompting), providing practical effect size context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 80/20 train/test split is stated but not justified; no power analysis or sample size rationale is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No standard deviation, variance across runs, or other measure of dispersion is reported for any experimental result.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Two baselines are included: a prompting approach for single-domain alignment and joint training for multidomain, plus a 'default' (no instruction) condition.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Prompt engineering is the natural inference-time comparison; joint training is the standard retraining baseline; both are appropriate and current comparisons for this problem.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The systematic lambda sweep and out-of-domain transfer experiments in Table 4 function as ablations of the key integration parameter and domain specificity claim.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Two evaluation metrics are used: log-probability-based 'preference accuracy' and 'GPT-4 judged generation accuracy' as an auxiliary metric.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "Human evaluation was used only for dataset construction quality assessment (Cohen's kappa on annotator label agreement), not for evaluating the alignment system's outputs directly.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The paper uses an 80% training / 20% test split plus a 3% validation allocation, explicitly separating evaluation from training data.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by domain (medical, financial, legal) and by proficiency level (Avd/Gen/Exp) in separate tables for each domain.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 6.2 explicitly notes that 5 of 27 multidomain behavior combinations cannot be achieved and attributes this to over-generalization across domains.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports that safety alignment at negative lambda becomes 'inconsistent and mixed,' and that multi-domain continuous tunability fails due to over-generalization effects.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Mistral-7B-Instruct-v0.3 is specified as the base model and Claude-3-Sonnet is named as the data generation model.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Appendices B and C provide the actual prompts used for CreatePersona, PersonaHub query generation, and all three response proficiency levels with template variables shown.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "DPO beta=0.1 with IPO loss, batch size=4, one epoch, lambda interval=0.1, 80/20 train/test split, and 3% validation allocation are all reported.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "This is not an agentic system; no agentic scaffolding is used.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 3 documents the full data pipeline including persona selection, hierarchical persona generation, three-level response generation, and cleanup steps (truncation, reformatting).",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The dataset link is a placeholder with no actual URL; the synthetic dataset cannot be accessed or verified independently.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 3 describes the data collection process in sufficient detail: persona sources, query generation method, response generation with instructions, and human annotation for quality control.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "The paper mentions 'three annotators' reviewed 30 queries each but provides no information about their background, expertise, recruitment, or compensation.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full data pipeline is described: PersonaHub/CreatePersona → LLM query generation → three-level response generation → human quality evaluation → train/validation/test split.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "The paper evaluates on its own synthetically generated dataset rather than standard benchmarks, making training cutoff contamination concerns inapplicable.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "The synthetic data is freshly generated and split by the authors; train/test overlap in the traditional benchmark sense does not apply.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "No standard public benchmarks are used for evaluation; the custom synthetic dataset eliminates traditional benchmark contamination concerns.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human subjects experiment is conducted; the small annotation quality check does not constitute a human study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants study requiring IRB approval is conducted.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants study is conducted.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants study is conducted.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants study is conducted.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants study is conducted.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants study is conducted.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "The 'inference cost by half' claim is argued theoretically from prompt token count but no actual latency measurements, token counts, or cost benchmarks are reported.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Training is described as approximately 72 hours per DPO job on NVIDIA A100 GPUs, providing an implicit compute budget for the alignment fine-tuning phase.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Alignment Vectors enable fine-grained inference-time preference control across three proficiency levels without additional training or prompt overhead",
    375       "evidence": "Tables 1-3 show lambda sweep successfully achieving Avd, Gen, and Exp dominant behaviors in medical, financial, and legal domains",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The AV method reduces inference cost by half compared to prompt engineering",
    380       "evidence": "Argued theoretically (instruction prompts are as long as queries, doubling token count) but never empirically measured with latency or cost data",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "Multidomain diverse preference alignment is 12x faster than full retraining",
    385       "evidence": "Calculated as 72h × 27 training jobs = 1,944h vs. ~155h for grid search (9,261 evaluations × 60s each); relies on rough estimates, not benchmarked measurements",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "AV from one specialized domain generalizes out-of-domain to other specialized domains",
    390       "evidence": "Table 4 shows medical AV at λ=0.5 produces expert-dominant behavior in financial and legal domains as well",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Domain-specific AV editing does not cause significant regression in general safety or helpfulness alignment",
    395       "evidence": "Table 4 reports safety accuracy changes within ±12% and helpfulness within ±18% range under domain-specific AV integration",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "The model can achieve 22 of 27 possible multidomain behavior combinations through coefficient grid search",
    400       "evidence": "Grid search over 21^3 coefficient combinations is stated to yield 22/27 desired configurations, but raw results are not fully tabulated",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "The paper introduces Alignment Vectors (AV), computed by subtracting base model weights from DPO-aligned model weights, enabling scalar-multiplier control over LLM response proficiency at inference time without retraining. Experiments on Mistral-7B-Instruct-v0.3 with a 38K synthetic dataset across medical, financial, and legal domains show that varying lambda achieves avoidance, generic, and expert response levels, and that AVs generalize across domains. The multidomain approach achieves 22 of 27 possible behavior combinations via grid search, claimed to be 12x faster than training separate models for each configuration, though this estimate relies on rough time assumptions rather than empirical benchmarking.",
    408   "red_flags": [
    409     {
    410       "flag": "Unverified inference cost claim",
    411       "detail": "The 'inference cost by half' claim in the abstract is derived theoretically from prompt token length doubling, not from any measured latency, throughput, or API cost data."
    412     },
    413     {
    414       "flag": "Circular synthetic data evaluation",
    415       "detail": "Responses were generated by Claude-3-Sonnet using proficiency-level prompts, and human annotators evaluated against Claude-3-Sonnet's intended level as 'ground truth,' creating a circular quality assessment of the generation model."
    416     },
    417     {
    418       "flag": "Single model only",
    419       "detail": "All experiments use only Mistral-7B-Instruct-v0.3 despite broad claims about 'LLM alignment'; the limitations section acknowledges this but the main text does not bound claims accordingly."
    420     },
    421     {
    422       "flag": "No statistical testing or error bars",
    423       "detail": "All comparative results are single-run point estimates with no confidence intervals, significance tests, or variance across runs, making it impossible to assess whether observed differences are reliable."
    424     },
    425     {
    426       "flag": "Broken dataset release link",
    427       "detail": "The paper states the dataset 'will be available through this link' with a placeholder — no actual URL is provided, so neither the data nor the claims can be independently verified."
    428     },
    429     {
    430       "flag": "12x speedup based on rough estimates",
    431       "detail": "The 12x training time speedup is calculated from an assumed 72h per training job rather than actual measured training times across all 27 configurations, making the quantitative efficiency claim unreliable."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "Editing models with task arithmetic",
    437       "relevance": "Foundational method AV directly extends; weight-space subtraction to encode task-specific directions."
    438     },
    439     {
    440       "title": "Direct preference optimization: Your language model is secretly a reward model",
    441       "relevance": "The DPO algorithm used to obtain the aligned model from which alignment vectors are extracted."
    442     },
    443     {
    444       "title": "Training language models to follow instructions with human feedback",
    445       "relevance": "Standard RLHF training-time alignment baseline that inference-time approaches aim to replace."
    446     },
    447     {
    448       "title": "InferAligner: Inference-time alignment for harmlessness through cross-model guidance",
    449       "relevance": "Related inference-time alignment work compared for controllability and input-dependency limitations."
    450     },
    451     {
    452       "title": "DeAl: Decoding-time alignment for large language models",
    453       "relevance": "Related decoding-time alignment approach; compared as a slower search-based alternative."
    454     },
    455     {
    456       "title": "Inference-time intervention: Eliciting truthful answers from a language model",
    457       "relevance": "Related attention-head intervention for inference-time alignment; compared for controllability and applicability scope."
    458     },
    459     {
    460       "title": "Scaling synthetic data creation with 1,000,000,000 personas",
    461       "relevance": "PersonaHub dataset used to source personas for domain-specific query generation."
    462     },
    463     {
    464       "title": "BeaverTails: Towards improved safety alignment of LLM via a human-preference dataset",
    465       "relevance": "PKU-SafeRLHF dataset used for safety dimension AV experiments in Section 6.3."
    466     },
    467     {
    468       "title": "Judging LLM-as-a-judge with MT-bench and chatbot arena",
    469       "relevance": "Basis for the GPT-4-as-judge secondary evaluation metric used throughout the paper."
    470     }
    471   ],
    472   "engagement_factors": {
    473     "practical_relevance": {
    474       "score": 2,
    475       "justification": "Inference-time preference tuning without retraining has direct enterprise value for regulated domains like healthcare and legal, though the single-architecture restriction limits immediate adoption."
    476     },
    477     "surprise_contrarian": {
    478       "score": 1,
    479       "justification": "Weight subtraction for alignment control is an incremental extension of existing task arithmetic; the domain application is novel but the core mechanism is not surprising."
    480     },
    481     "fear_safety": {
    482       "score": 1,
    483       "justification": "The paper shows models can be steered toward avoidance or potentially unsafe behaviors via negative lambda, but this is framed as a feature rather than a safety risk."
    484     },
    485     "drama_conflict": {
    486       "score": 0,
    487       "justification": "No controversial claims or community conflict; straightforward engineering contribution submitted to ICLR 2025."
    488     },
    489     "demo_ability": {
    490       "score": 1,
    491       "justification": "The lambda-based tunable knob concept is easily demonstrable in principle, but no code, model weights, or dataset are released, preventing immediate reproduction."
    492     },
    493     "brand_recognition": {
    494       "score": 1,
    495       "justification": "AWS AI Labs has visibility in the ML community but is not a headline AI research lab driving significant HN attention."
    496     }
    497   },
    498   "hn_data": {
    499     "threads": [
    500       {
    501         "hn_id": "47157759",
    502         "title": "Can Chain-of-Thought Reasoning Solve Any Computable Task?",
    503         "points": 4,
    504         "comments": 1,
    505         "url": "https://news.ycombinator.com/item?id=47157759"
    506       },
    507       {
    508         "hn_id": "38355249",
    509         "title": "Open Problems in DAOs",
    510         "points": 3,
    511         "comments": 0,
    512         "url": "https://news.ycombinator.com/item?id=38355249"
    513       },
    514       {
    515         "hn_id": "42581055",
    516         "title": "Medec: A Benchmark for Medical Error Detection and Correction in Clinical Notes",
    517         "points": 2,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=42581055"
    520       }
    521     ],
    522     "top_points": 4,
    523     "total_points": 9,
    524     "total_comments": 1
    525   }
    526 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs