scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (31113B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "ELLA: Equip Diffusion Models with LLM for Enhanced Semantic Alignment",
      6     "authors": [
      7       "Xiwei Hu",
      8       "Rui Wang",
      9       "Yixiao Fang",
     10       "Bin Fu",
     11       "Pei Cheng",
     12       "Gang Yu"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2403.05135",
     17     "doi": "10.48550/arXiv.2403.05135"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All abstract claims about ELLA's lightweight adapter, TSC design, and superior dense prompt following are substantiated by detailed method descriptions, ablation studies, and experimental results on T2I-CompBench and DPG-Bench.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Causal claims are justified through ablation studies on architecture components (Table 6), LLM selection (Table 5), user studies validating automatic metrics (Fig 5), and attention visualizations. Comparisons to competitive baselines (SDXL, PixArt-α, DALL-E 3) support claims of improvement.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Generalizations are bounded to tested settings: dense prompt scenarios, text-to-image generation, and integration with Stable Diffusion-based models. Authors acknowledge limitations with MLLM caption weaknesses (shape, spatial relationships) and frozen U-Net constraints. Testing spans multiple benchmarks and community models.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper presents one interpretation for each result without discussing alternatives. For instance, TSC's superiority over AdaLN-Zero is shown empirically but not explained. The paper assumes LLMs help because of 'better language understanding' without exploring if gains come from other factors like larger embeddings or capacity.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Paper clearly distinguishes automated metrics (mPLUG-based VQA) from human evaluation (user study ranking). User study results (Fig 5) validate that automated DPG-Bench scoring correlates with human perception of semantic alignment, supporting the proxy validity.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "A dedicated limitations section exists in Section 6 (Conclusion and Limitation) discussing MLLM caption biases and frozen U-Net constraints, though it is brief.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Limitations discussed are about method constraints (MLLM caption weaknesses, frozen U-Net) rather than experimental validity. No specific threats are addressed: user study sample size (n=20), potential bias in MLLM-based evaluation metrics, or whether baseline implementations are optimal.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper does not explicitly state scope boundaries. While experiments focus on dense prompts and Stable Diffusion variants, there is no dedicated discussion of what ELLA is NOT designed for (e.g., other modalities, non-diffusion models, real-time constraints).",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding source is disclosed. The paper lists Tencent as affiliation but does not state whether Tencent funded this work or if it was supported by grants/external sources.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are clearly disclosed (all Tencent-affiliated), though a competing-interests statement is absent. The comparison includes OpenAI (DALL-E 3) and community models, with no declared conflicts.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Funding source is not disclosed, so cannot assess independence. If Tencent funded this work, there would be a potential interest in ELLA outperforming baselines.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests, patents, or financial relationships are declared. The paper includes no COI statement.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms like 'dense prompts,' 'timestep-aware,' and 'TSC' are explained in context (Section 3.1, 4). 'Semantic alignment' is defined operationally through benchmarks. Standard ML terms (denoising, text encoder) are assumed known. Definitions are adequate for the target audience.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Contributions are explicitly listed in introduction: (1) lightweight ELLA adapter without U-Net/LLM training, (2) TSC design, (3) DPG-Bench for dense prompts, (4) empirical superiority. Each contribution is clear and the paper demonstrates all of them.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Related work section engages with prior approaches, distinguishing ELLA's lightweight adapter design (no U-Net training) from full-training methods like ParaDiffusion and Imagen. Connection to training-free compositional methods noted. Engagement is present though could be deeper in explaining novelty.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "The paper provides a website URL (ella-diffusion.github.io) but does not explicitly state source code is released. No GitHub repository, HuggingFace link, or direct code availability statement is provided in the paper.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "DPG-Bench is described but not stated as released. Custom CogVLM-annotated training data (30M captions on LAION/COYO) is not mentioned as available. Only existing public datasets (LAION, COYO, JourneyDB) are used for training.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "Training hyperparameters and hardware are specified, but no environment/dependency specs provided (no requirements.txt, Python version, PyTorch/CUDA versions, or Dockerfile). Reproducibility requires external knowledge of dependencies.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "The paper describes method and training procedure but provides no step-by-step reproduction instructions. No training commands, inference scripts, or evaluation code provided. Reproducibility requires substantial reverse-engineering from method description.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Main results (Tables 3-6) report single scores without error bars or confidence intervals. User study (Fig 5) shows win percentages without CIs. Variance across runs or evaluation instances is not reported.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "Comparative claims throughout (e.g., ELLASDXL outperforms baselines) lack statistical significance tests. User study (Fig 5) reports win percentages without significance tests. No p-values or statistical tests provided.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Effect sizes can be calculated from reported scores (e.g., ELLASDXL 80.23 vs SDXL 74.65 = 7.5% relative improvement on DPG-Bench). User study reports win/tie/loss percentages. Explicit effect size metrics (Cohen's d) not provided, but improvements are quantifiable.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "Sample sizes (1,065 prompts, 20 users per prompt) are used but not justified. No power analysis provided. Ablation study acknowledges using fewer training steps due to compute constraints but does not justify the chosen sample sizes.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No variance metrics reported. Tables 3-6 show single scores without error bars, standard deviations, or multiple runs. User study (Fig 5) shows win distributions but no confidence intervals. No indication of variance across repeated evaluations.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Multiple strong baselines included: Stable Diffusion variants, SDXL, PixArt-α, DALL-E 3, and compositional generation methods. Comparisons span both short prompts (T2I-CompBench) and dense prompts (DPG-Bench).",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Baselines are contemporary (2022-2024, close to paper date of March 2024). Includes SDXL, PixArt-α, DALL-E 3, and recent open-source models. Appropriately strong comparisons for dense prompt evaluation.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Ablation studies on LLM choice (Table 5) and module architecture (Table 6) test key design decisions. Timestep awareness is validated through attention visualization (Fig 8). Ablations justify selection of T5-XL and AdaLN-based TSC.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Multiple evaluation metrics across benchmarks: T2I-CompBench evaluates attribute binding, color, shape, texture, spatial relations. DPG-Bench provides global, entity, attribute, relation scores. User study adds human judgments on semantic alignment and aesthetic quality.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "User study (20 users per prompt) evaluates semantic alignment and aesthetic quality of generated images. Results (Fig 5) show human preference rankings align with automated DPG-Bench scores, validating the automatic metric.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "DPG-Bench and T2I-CompBench are held-out from training data (LAION/COYO/JourneyDB). Evaluation on separate benchmark data provides test set separation, though no explicit verification that benchmarks do not overlap with web-scraped training sources.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "DPG-Bench results (Table 4) broken down by five categories (Global, Entity, Attribute, Relation, Other). T2I-CompBench (Table 3) shows per-attribute breakdown (Color, Shape, Texture, Spatial/Non-Spatial relationships).",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": false,
    225           "justification": "No failure cases shown or discussed. Limitations acknowledge weaknesses (MLLM caption biases, frozen U-Net constraints) but do not demonstrate specific failure scenarios or outputs where ELLA underperforms.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": false,
    231           "justification": "No negative results reported. All results support ELLA's effectiveness. Ablations show design choices (AdaLN vs AdaLN-Zero) but do not discuss failed approaches or techniques that were abandoned.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Model names clearly specified (T5-XL 1.2B, SDv1.5, SDXL, LLaMA-2 13B, TinyLlama). While no exact snapshot versions given, published models are identifiable. Acceptable for reproducibility with standard published models.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "Example prompts are shown in qualitative results (Figs 4, 6, Table 4 footnote), but full prompt sets are not provided. DPG-Bench is described as created with GPT-4, but its prompts are not included in the paper and no repository containing them is linked. Reproduction would require external access to the prompts.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Key training hyperparameters reported: AdamW optimizer, learning rates (1e-4, 1e-5), weight decay, training steps, resolution, token length. Some details missing (batch size, scheduler) but sufficient for partial reproduction with standard defaults.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding or explicit prompting tactics used. Models evaluated as text-to-image generators without planning/decomposition. Technical architecture (TSC) detailed but not agentic scaffolding.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Data preprocessing documented: aesthetic filtering (score>6, 512px min), CogVLM caption generation, and dataset composition (30M CogVLM-captioned LAION/COYO pairs plus 4M JourneyDB pairs, 34M in total). DPG-Bench creation process described (GPT-4 generation, human verification). Sufficient for understanding approach, though some details could be more explicit.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "Training data (LAION, COYO, JourneyDB) is publicly available, but specific 34M filtered pairs and CogVLM annotations are not released. DPG-Bench data not stated as released. Independent verification would require access to exact dataset subsets used.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Data collection described for primary dataset (LAION/COYO filtered + JourneyDB) and for DPG-Bench (prompts created by GPT-4 from existing sources, human-verified). References external sources for detailed LAION/COYO collection procedures.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": false,
    283           "justification": "User study recruitment not described. No information on how 20 users per prompt were recruited, compensated, or selected. The public training datasets (LAION, COYO) do not require a recruitment description.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "Data pipeline documented from collection (source datasets) through filtering (aesthetic score), annotation (CogVLM captions), training data assembly (34M+100k pairs), and evaluation procedure (automatic metrics + user study). Sufficient for understanding, though some details like exact CogVLM prompts not fully specified.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "Training cutoffs for T5-XL, SDXL/SDv1.5, or LLaMA-2 not stated. No discussion of whether pretrained models might have encountered LAION/COYO data during their training. Contamination risk not addressed.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "No discussion of potential overlap between LAION/COYO training data and DPG-Bench/T2I-CompBench evaluation data. Risk of pretrained T5/SDXL models having seen benchmark examples not addressed.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "No discussion of whether evaluation benchmarks (DPG-Bench, T2I-CompBench) might overlap with LAION/COYO training sources. Risk of contamination from web-scraped training data to publicly available benchmarks not addressed.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "User study is crowdsourced image ranking, not human subjects research with experimental conditions. No pre-registration needed or applicable.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human subject research; image ranking crowdsourcing does not require IRB approval.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "Demographics not applicable; crowdsourced rankers not profiled.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "Not applicable to crowdsourced image ranking.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "Not applicable; no randomization in image ranking task.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "Not applicable to image ranking.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "Not applicable; no attrition tracking in crowdsourced evaluation.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No inference cost or latency reported. Training cost mentioned (7-14 days on 8x A100) but inference cost not stated. Computational requirements for deployment not discussed.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "Training compute budget stated: 8x 40GB A100 GPUs, ~7 days for ELLASDv1.5, ~14 days for ELLASDXL. Noted as <80% of PixArt-α training cost (753 A100 GPU days).",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "ELLA improves dense prompt following without training U-Net or LLM",
    376       "evidence": "Table 4 shows ELLASDXL (80.23) outperforms SDXL (74.65) and PixArt-α (71.11) on DPG-Bench; only TSC is trained (0.47B parameters vs 2.61B for SDXL)",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Timestep-aware semantic features improve dense prompt understanding",
    381       "evidence": "Ablation Table 6 shows Resampler+AdaLN (TSC) outperforms Resampler without timestep and AdaLN-Zero variant; Fig 8 visualization shows attention shifts across timesteps corresponding to semantic content",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "DPG-Bench is a valid evaluation metric correlating with human judgment",
    386       "evidence": "Fig 5 user study shows human preferences align with DPG-Bench scores (62.82% ELLA wins on semantic alignment); 20 users per prompt rank images",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "MLLM-generated captions improve training over alt-text",
    391       "evidence": "Table 1 shows CogVLM captions have 5x more nouns, adjectives, prepositions than LAION/COYO alt-text; training uses 30M annotated pairs",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "T5-XL outperforms CLIP as text encoder for dense prompts",
    396       "evidence": "Table 5 shows T5-XL (71.70) outperforms CLIP (63.18) on DPG-Bench; LLaMA-2 (72.05) and TinyLlama (70.27) also outperform CLIP",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "ELLA integrates seamlessly with community models and downstream tools",
    401       "evidence": "Fig 7 shows ELLA combined with 6 CivitAI models (ReV Animated, Flat-2D, Animerge, Counterfeit, Realistic Vision, DreamShaper) improves prompt following while maintaining style",
    402       "supported": "strong"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "benchmark-eval",
    407     "case-study"
    408   ],
    409   "key_findings": "ELLA successfully equips CLIP-based diffusion models with language understanding from large language models via a lightweight, trainable adapter (TSC) that dynamically adjusts semantic features across diffusion timesteps. Without training the base U-Net or LLM, ELLA achieves 80.23 on the new DPG-Bench (dense prompts), outperforming open-source baselines (SDXL 74.65, PixArt-α 71.11) and approaching DALL-E 3 (83.50). A user study validates that automatic DPG-Bench metrics correlate with human judgment of semantic alignment, and ELLA successfully integrates with 6+ community models to enhance their prompt-following capabilities.",
    410   "red_flags": [
    411     {
    412       "flag": "No statistical significance testing",
    413       "detail": "Comparative claims (ELLA vs SDXL, PixArt-α) lack p-values or significance tests. User study win rates (62.82%) not tested for significance."
    414     },
    415     {
    416       "flag": "Limited error reporting",
    417       "detail": "Single point estimates throughout; no error bars, confidence intervals, or variance across runs. Reproducibility of reported scores unclear."
    418     },
    419     {
    420       "flag": "Evaluation metric bias",
    421       "detail": "Automatic evaluation uses mPLUG-large VQA, which may be biased toward certain aesthetic qualities. Heavy reliance on MLLM evaluation with no robustness checks."
    422     },
    423     {
    424       "flag": "Dataset contamination not addressed",
    425       "detail": "Training uses public web data (LAION, COYO); potential overlap with benchmarks not discussed. Pretrained model training cutoffs not stated."
    426     },
    427     {
    428       "flag": "No code/data release confirmed",
    429       "detail": "Website URL provided but no explicit statement of code or DPG-Bench release. Reproducibility limited without code."
    430     },
    431     {
    432       "flag": "Failure modes not discussed",
    433       "detail": "Paper shows only successes. Limitations mention MLLM caption weaknesses (shape, spatial relations) but no failure cases demonstrated."
    434     },
    435     {
    436       "flag": "Sample size not justified",
    437       "detail": "DPG-Bench (1,065 prompts) and user study (20 users per prompt) sizes not justified via power analysis or prior work."
    438     },
    439     {
    440       "flag": "Training data synthesis not ablated",
    441       "detail": "30M training captions synthesized by CogVLM; no ablation showing benefit over original alt-text beyond vocabulary analysis in Table 1."
    442     }
    443   ],
    444   "cited_papers": [
    445     {
    446       "title": "Imagen: Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding",
    447       "authors": "Saharia et al.",
    448       "year": 2022,
    449       "relevance": "Prior work using LLM for text-to-image; requires full U-Net fine-tuning, motivating lightweight adapter approach"
    450     },
    451     {
    452       "title": "ParaDiffusion: Paragraph-to-Image Generation with Information-Enriched Diffusion Model",
    453       "authors": "Wu et al.",
    454       "year": 2023,
    455       "relevance": "Alternative approach to dense prompt understanding via LLaMA fine-tuning; ELLA avoids expensive LLM retraining"
    456     },
    457     {
    458       "title": "PixArt-α: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis",
    459       "authors": "Chen et al.",
    460       "year": 2023,
    461       "relevance": "Baseline using T5 encoder but requires full model training from scratch; comparison point for efficiency"
    462     },
    463     {
    464       "title": "DALL-E 3: Improving Image Generation with Better Captions",
    465       "authors": "Betker et al.",
    466       "year": 2023,
    467       "relevance": "SOTA closed-source system with superior performance (83.50 DPG-Bench vs ELLA 80.23); comparison for human preference alignment"
    468     },
    469     {
    470       "title": "T2I-CompBench: A Comprehensive Benchmark for Open-World Compositional Text-to-Image Generation",
    471       "authors": "Huang et al.",
    472       "year": 2024,
    473       "relevance": "Short prompt evaluation benchmark; evaluates attribute binding and object relationships tested in Table 3"
    474     },
    475     {
    476       "title": "Flamingo: a visual language model for few-shot learning",
    477       "authors": "Alayrac et al.",
    478       "year": 2022,
    479       "relevance": "Perceiver Resampler design adopted as basis for TSC architecture in ELLA"
    480     },
    481     {
    482       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    483       "authors": "Hu et al.",
    484       "year": 2021,
    485       "relevance": "Lightweight adaptation technique; mentioned as compatible downstream tool integrated with ELLA (Fig 7)"
    486     },
    487     {
    488       "title": "CogVLM: Visual Expert for Pretrained Language Models",
    489       "authors": "Wang et al.",
    490       "year": 2023,
    491       "relevance": "MLLM used for synthetic dense caption generation in dataset construction; produces 30M annotated training pairs"
    492     }
    493   ],
    494   "engagement_factors": {
    495     "practical_relevance": {
    496       "score": 2,
    497       "justification": "ELLA improves prompt following for community models, but requires training TSC (70M-470M params), limiting immediate practical use without public release. Frozen U-Net design enables integration with existing tools."
    498     },
    499     "surprise_contrarian": {
    500       "score": 1,
    501       "justification": "Using LLMs for text-to-image is known (Imagen, DALL-E 3). The lightweight adapter approach is incremental, not contrarian to the field."
    502     },
    503     "fear_safety": {
    504       "score": 0,
    505       "justification": "Text-to-image generation paper with no explicit AI safety, security, or bias concerns discussed."
    506     },
    507     "drama_conflict": {
    508       "score": 0,
    509       "justification": "Technical contribution paper; no controversy, conflicting claims, or dramatic findings."
    510     },
    511     "demo_ability": {
    512       "score": 1,
    513       "justification": "Approach is trainable and demonstrates improvements on benchmarks, but no released model or public demo confirmed in paper."
    514     },
    515     "brand_recognition": {
    516       "score": 1,
    517       "justification": "Tencent research published on arXiv. Not from top-tier AI labs (OpenAI, DeepMind, Meta, Google) but reputable industrial research."
    518     }
    519   },
    520   "hn_data": {
    521     "threads": [
    522       {
    523         "hn_id": "45323027",
    524         "title": "The Beginner's Textbook for Fully Homomorphic Encryption",
    525         "points": 251,
    526         "comments": 46,
    527         "url": "https://news.ycombinator.com/item?id=45323027"
    528       },
    529       {
    530         "hn_id": "43460455",
    531         "title": "Every Flop Counts: Scaling a 300B LLM Without Premium GPUs",
    532         "points": 117,
    533         "comments": 9,
    534         "url": "https://news.ycombinator.com/item?id=43460455"
    535       },
    536       {
    537         "hn_id": "43477150",
    538         "title": "Scaling a 300B Mixture-of-Experts LING LLM Without Premium GPUs",
    539         "points": 2,
    540         "comments": 0,
    541         "url": "https://news.ycombinator.com/item?id=43477150"
    542       },
    543       {
    544         "hn_id": "41500876",
    545         "title": "End-to-End Quantum Simulation of a Chemical System",
    546         "points": 2,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=41500876"
    549       },
    550       {
    551         "hn_id": "38950373",
    552         "title": "InseRF: Text-Driven Generative Object Insertion in Neural 3D Scenes",
    553         "points": 2,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=38950373"
    556       },
    557       {
    558         "hn_id": "35138597",
    559         "title": "Rewarding Chatbots for Real-World Engagement with Millions of Users",
    560         "points": 1,
    561         "comments": 2,
    562         "url": "https://news.ycombinator.com/item?id=35138597"
    563       },
    564       {
    565         "hn_id": "36898761",
    566         "title": "Rewarding Chatbots for Real-World Engagement with Millions of Users",
    567         "points": 1,
    568         "comments": 1,
    569         "url": "https://news.ycombinator.com/item?id=36898761"
    570       },
    571       {
    572         "hn_id": "40619823",
    573         "title": "Air Gap: Protecting Privacy-Conscious Conversational Agents",
    574         "points": 1,
    575         "comments": 0,
    576         "url": "https://news.ycombinator.com/item?id=40619823"
    577       },
    578       {
    579         "hn_id": "39430857",
    580         "title": "Personalized Language Modeling from Personalized Human Feedback",
    581         "points": 1,
    582         "comments": 0,
    583         "url": "https://news.ycombinator.com/item?id=39430857"
    584       },
    585       {
    586         "hn_id": "39066423",
    587         "title": "Asynchronous Local-SGD Training for Language Modeling",
    588         "points": 1,
    589         "comments": 0,
    590         "url": "https://news.ycombinator.com/item?id=39066423"
    591       }
    592     ],
    593     "top_points": 251,
    594     "total_points": 379,
    595     "total_comments": 58
    596   }
    597 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs