scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25949B)
      1 {
      2   "paper": {
      3     "title": "Test-Time Matching: Unlocking Compositional Reasoning in Multimodal Models",
      4     "authors": ["Yinglun Zhu", "Jiancheng Zhang", "Fuzhi Tang"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.07632",
      8     "doi": "10.48550/arXiv.2510.07632"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Standard GroupScore evaluation metrics systematically underestimate multimodal model capability on compositional reasoning benchmarks. The proposed GroupMatch metric and SimpleMatch procedure reveal hidden capability, enabling GPT-4.1 to surpass estimated human performance on Winoground (91.38 vs 85.5). Test-Time Matching (TTM), an iterative self-training algorithm with decaying threshold schedules, further boosts performance without external supervision, enabling SigLIP-B16 to surpass GPT-4.1 on MMVP-VLM. Results are consistent across 16 dataset variants spanning diverse group structures.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub repository URL provided: https://github.com/yinglunz/test-time-matching (stated in the abstract header)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available benchmarks (Winoground, MMVP-VLM, ColorSwap, SugarCrepe, WhatsUp). No proprietary data was collected."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency listings are mentioned in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but no README or reproduction steps are described in the text."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Standard deviations are reported with ± notation throughout Tables 1, 2, and other results (e.g., 'GPT-4.1 69.75 ± 0.56', 'SigLIP-B16 72.50 ± 0.64')."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are reported. Claims of improvement are based on comparing point estimates with standard deviations but no formal hypothesis tests (p-values, t-tests, etc.) are used."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Absolute gains (∆), relative gains (%), and relative error reductions are reported throughout Tables 1, 6, 7, and 8 with baseline context (e.g., '+ 5.5 (8.2% ↑) 16.7% ↓')."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for sample sizes of the benchmarks used. The paper inherits benchmark sizes (e.g., Winoground has 400 groups) without discussing whether these are sufficient for the claims made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Results are averaged over four random runs with standard deviations reported (Section 4.1: 'All results are averaged over four random runs, with standard deviations reported')."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Raw model performance (GroupScore) serves as the baseline, and prior state-of-the-art results are cited (e.g., 58.75 on Winoground from GPT-4V with prompt tuning, 70.7 on MMVP). Multiple models at multiple scales are compared."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include GPT-4.1 (2025), SigLIP, CLIP, and prior SOTA from 2023-2025 (Wu et al., 2023; Vaishnav and Tammet, 2025; Zhang et al., 2024c)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 4.5 provides ablations: different threshold schedules (constant, ascend, decay), different initial thresholds τ1, oracle matching skyline, and comparison of different evaluation metrics (GroupScore, GlobalMatch, IndividualMatch, GroupMatch) in Figure 4."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: GroupScore, GroupMatch, absolute gains, relative gains, and relative error reductions. Results are reported under both GroupScore and GroupMatch."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation of model outputs is not relevant here — this paper proposes a metric and test-time adaptation method for existing benchmarks that already have ground truth."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "TTM is a test-time method that adapts on the test set itself. There is no held-out test set — the same data used for adaptation is used for evaluation. The paper acknowledges this is the design (self-training at test time), but from an evaluation rigor perspective the test set is not held out."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results broken down by dataset (Winoground, MMVP-VLM, ColorSwap, SugarCrepe subsets, WhatsUp subsets), by model (CLIP-B16, SigLIP-B16, SigLIP-L16, GPT-4.1), and by group structure type (2×2, 1×k, non-grouped)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No qualitative error analysis or failure case examples are provided. The paper does not discuss where TTM fails or what types of compositional reasoning remain challenging after TTM."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Figure 2 (right) shows that the ascending threshold schedule yields no gains ('the model quickly overfits to all pseudo-labels in the first iteration'). The constant threshold schedule also underperforms decay."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims (GPT-4.1 surpassing human performance on Winoground, SigLIP-B16 surpassing GPT-4.1 on MMVP-VLM, gains across 16 dataset variants) are all supported by Tables 1, 2, Figures 1, 3, and Tables 6-8."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper's causal claims are primarily about algorithmic effects (SimpleMatch and TTM improve scores). These are justified through controlled experiments: the same models are evaluated with and without the proposed methods, and ablations in Section 4.5 isolate component contributions."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper carefully scopes claims to 'compositional reasoning' and tests across 16 dataset variants. The Discussion section proposes 'extending TTM beyond compositional reasoning' as future work rather than claiming it already generalizes. The title specifies 'Compositional Reasoning in Multimodal Models'."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for why TTM works beyond the matching/pseudo-label quality narrative. For example, it doesn't consider whether TTM might be overfitting to dataset artifacts, or whether the gains are partly due to data augmentation effects rather than the matching mechanism."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper is careful about what it measures. It explicitly distinguishes between GroupScore and GroupMatch metrics, discusses what each measures, and notes that 'the same model on the same dataset can yield vastly different results under different metrics' (Section 6). Claims match measurement granularity."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "GPT-4.1-2025-04-14 is specified with exact snapshot date (footnote 1). SigLIP-B16, SigLIP-L16, CLIP-B16, CLIP-B32 are specified by architecture variant."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper uses contrastive VLMs (SigLIP, CLIP) with embedding similarity and GPT-4.1 with VQAScore — no prompting in the traditional sense is used by the authors' method."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix B.1 provides comprehensive hyperparameters: T=10 iterations, 20 epochs per iteration (30 for Winoground), AdamW with weight decay 0.05, β values, cosine decay LR schedule, batch sizes. Tables 3-5 give per-dataset/model threshold and learning rate settings."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. TTM is a standard test-time training algorithm."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Data preprocessing is documented: Section 4.1 describes how datasets are used, Section 3.2.1 explains how non-grouped variants are constructed by 'flattening' (removing local k×k groups), and WhatsUp 2×2 variants are constructed following Li et al. (2025). Data augmentation choices documented in Appendix B.1."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section. The Discussion (Section 6) mentions future directions but does not discuss limitations of the current work."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. The paper does not address potential concerns such as overfitting to test data, sensitivity to threshold selection beyond the ablation, or the philosophical question of whether test-time adaptation on the test set constitutes fair evaluation."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries stated. The paper does not state what it does NOT show or what settings TTM would not work in."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "All benchmarks used are publicly available (Winoground, MMVP-VLM, ColorSwap, SugarCrepe, WhatsUp). Code repository is provided for verification of the method."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.1 describes all datasets used, their structures (2×2, 1×k, non-grouped), and how variants were constructed. All are established public benchmarks with published descriptions."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from raw benchmark data → similarity matrix computation → matching → pseudo-labeling → finetuning is fully described in Sections 3.1-3.2 and Algorithm 1."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding sources or acknowledgments are mentioned in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are affiliated with University of California, Riverside, clearly stated in the header."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The training cutoff for GPT-4.1 is not stated. For CLIP and SigLIP, training data cutoffs are not discussed. Footnote 1 mentions the GPT-4.1 snapshot date but not its training-data cutoff."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether benchmark images/captions appeared in the training data of CLIP, SigLIP, or GPT-4.1."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Winoground (2022), MMVP-VLM (2024), SugarCrepe (2023), WhatsUp (2023), ColorSwap (2024) were all published before or around the training of the models used. No contamination analysis is performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost or latency is reported. TTM requires iterative finetuning at test time (T=10 iterations, 20-30 epochs each) plus GPT-4.1 API calls for VQAScore, but costs are not quantified."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget (GPU hours, API costs, hardware) is reported despite the method requiring repeated finetuning iterations."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Results are averaged over 4 random runs with standard deviations reported (Section 4.1). Variation across runs is visible in the ± values."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 4.1 explicitly states: 'All results are averaged over four random runs, with standard deviations reported.'"
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. Tables 3-5 report final hyperparameter settings per dataset/model pair, but the search process that found these configurations is not described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper provides per-dataset-per-model hyperparameters in Tables 3-5 but does not explain how these were selected. The ablation in Figure 4 (right) shows sensitivity to τ1, but no validation set selection procedure is described."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No significance tests are performed at all, so no multiple-comparison correction is applied either. The paper reports many comparisons across 16 dataset variants and multiple models."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors propose TTM and evaluate it themselves. No acknowledgment of author-evaluation bias, no independent evaluation."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "TTM requires iterative finetuning (10 iterations × 20-30 epochs) while raw model evaluation requires a single forward pass. This massive compute difference is not discussed or compared."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "A core contribution of the paper is questioning whether GroupScore actually measures what it claims (Section 3.1). Propositions 1 and 2 formally analyze random guessing probabilities under different metrics, and Figure 4 (left) compares evaluation metrics."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. TTM is a direct test-time training method, not a scaffolded agent system."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether benchmark data (Winoground 2022, SugarCrepe 2023, etc.) was in the training data of models trained after those dates."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the group structure of benchmarks (pairing images/captions together) provides information leakage. The matching-based approach explicitly exploits group structure at test time, but the fairness of this is not discussed."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether benchmark examples share structural similarities or whether groups within datasets are independent."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Standard GroupScore metrics systematically underestimate model capability on compositional reasoning benchmarks.",
    365       "evidence": "Propositions 1 and 2 (Section 3.1) prove that random guessing probability under GroupScore is (k-1)!/(2k-1)! vs 1/k! under GroupMatch. For k=2, this is 1/6 vs 1/2. Empirical validation in Figure 1 and Table 1 shows large gaps between raw GroupScore and GroupMatch-induced performance.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "SimpleMatch enables GPT-4.1 to surpass estimated human performance on Winoground (91.38 vs 85.5).",
    370       "evidence": "Table 1 shows GPT-4.1 achieves 91.38 ± 0.80 under GroupMatch vs raw GroupScore of 69.75 ± 0.56. Human performance estimate of 85.5 from Thrush et al. (2022).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "TTM enables SigLIP-B16 to surpass GPT-4.1 on MMVP-VLM, establishing a new state of the art (89.44 vs 88.52).",
    375       "evidence": "Table 1: SigLIP-B16 with TTM achieves 89.44 ± 0.96 vs GPT-4.1 SimpleMatch at 88.52 ± 0.83 on MMVP-VLM.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "TTM delivers substantial improvements even on benchmarks without metric-induced effects (1×k groups), with up to 85.7% relative gains on WhatsUp.",
    380       "evidence": "Figure 3 and Table 7: CLIP-B32 improves from 30.58 to 56.8 on WhatsUp A (85.7% relative gain).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "TTM generalizes to non-grouped settings via global matching formulation.",
    385       "evidence": "Table 2 shows gains on non-grouped variants: e.g., ColorSwap SigLIP-B16 from 88.00 (SimpleMatch) to 92.00 (TTM), 33.3% error reduction.",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Test-time adaptation on test set blurs evaluation boundaries",
    392       "detail": "TTM finetunes models directly on the test set using pseudo-labels derived from the test set. While the paper frames this as 'test-time training', it fundamentally means the model is adapted to the specific test examples before being evaluated on them. This is acknowledged as the method's design but raises questions about fair comparison with methods that do not use the test set for adaptation."
    393     },
    394     {
    395       "flag": "Per-dataset hyperparameter tuning",
    396       "detail": "Tables 3-5 show different hyperparameters (thresholds, learning rates, schedules) tuned per dataset and per model. With 16 dataset variants and multiple models, there are many configuration choices, and no validation set selection procedure is described. This raises concerns about overfitting the method to specific benchmarks."
    397     },
    398     {
    399       "flag": "No contamination analysis",
    400       "detail": "Several benchmarks (Winoground 2022, SugarCrepe 2023) predate the training of the models used. The paper provides no analysis of whether benchmark data appeared in the models' training sets. This is especially relevant since the paper claims models have 'hidden capability' on these benchmarks."
    401     },
    402     {
    403       "flag": "No compute cost comparison",
    404       "detail": "TTM requires 10 iterations of finetuning (20-30 epochs each) versus a single forward pass for raw evaluation. This massive compute overhead is never quantified or discussed, making practical applicability hard to assess."
    405     },
    406     {
    407       "flag": "Comparison across different metrics may mislead",
    408       "detail": "The paper compares GPT-4.1 SimpleMatch (GroupMatch) against prior SOTA under GroupScore and human performance estimated under GroupScore. Comparing numbers from different evaluation metrics requires careful interpretation that may not be obvious to readers."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Are emergent abilities of large language models a mirage?",
    414       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    415       "year": 2023,
    416       "relevance": "Questions whether emergent abilities are artifacts of evaluation metrics — directly relevant to this paper's thesis about metric sensitivity."
    417     },
    418     {
    419       "title": "On the measure of intelligence",
    420       "authors": ["François Chollet"],
    421       "year": 2019,
    422       "arxiv_id": "1911.01547",
    423       "relevance": "Proposes ARC benchmark for measuring intelligence; TTM's test-time training approach connects to work on ARC."
    424     },
    425     {
    426       "title": "The surprising effectiveness of test-time training for few-shot learning",
    427       "authors": ["Ekin Akyürek", "Mehul Damani", "Adam Zweiger"],
    428       "year": 2025,
    429       "relevance": "Key prior work on test-time training that TTM builds upon and contrasts with (per-instance vs whole-dataset adaptation)."
    430     },
    431     {
    432       "title": "Test-time training on nearest neighbors for large language models",
    433       "authors": ["Moritz Hardt", "Yu Sun"],
    434       "year": 2024,
    435       "relevance": "Test-time training approach for LLMs that TTM extends with matching-based pseudo-labeling."
    436     },
    437     {
    438       "title": "GPT-4 technical report",
    439       "authors": ["Josh Achiam"],
    440       "year": 2023,
    441       "arxiv_id": "2303.08774",
    442       "relevance": "Foundation model used in the evaluation; relevant to LLM capability assessment."
    443     },
    444     {
    445       "title": "Evaluating text-to-visual generation with image-to-text generation",
    446       "authors": ["Zhiqiu Lin"],
    447       "year": 2024,
    448       "relevance": "VQAScore metric used to compute similarity for MLLMs in this paper's experiments."
    449     },
    450     {
    451       "title": "Winoground: Probing vision and language models for visio-linguistic compositionality",
    452       "authors": ["Tristan Thrush"],
    453       "year": 2022,
    454       "relevance": "Primary benchmark used; defines the GroupScore metric that this paper critiques."
    455     },
    456     {
    457       "title": "Eyes wide shut? Exploring the visual shortcomings of multimodal LLMs",
    458       "authors": ["Shengbang Tong"],
    459       "year": 2024,
    460       "relevance": "Introduces MMVP-VLM benchmark used in evaluation and documents visual shortcomings of MLLMs."
    461     },
    462     {
    463       "title": "Learning transferable visual models from natural language supervision",
    464       "authors": ["Alec Radford"],
    465       "year": 2021,
    466       "relevance": "CLIP model used as a baseline; foundational contrastive VLM."
    467     },
    468     {
    469       "title": "Sigmoid loss for language image pre-training",
    470       "authors": ["Xiaohua Zhai"],
    471       "year": 2023,
    472       "relevance": "SigLIP model used as primary contrastive VLM in experiments."
    473     }
    474   ]
    475 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs