scan.json (23557B)
1 { 2 "scan_version": 3, 3 "active_modules": [ 4 "experimental_rigor", 5 "data_leakage" 6 ], 7 "paper": { 8 "title": "3DShape2VecSet: A 3D Shape Representation for Neural Fields and Generative Diffusion Models", 9 "authors": [ 10 "Biao Zhang", 11 "Jiapeng Tang", 12 "Matthias Niessner", 13 "Peter Wonka" 14 ], 15 "year": 2023, 16 "venue": "ACM Transactions on Graphics (SIGGRAPH)", 17 "arxiv_id": "2301.11445" 18 }, 19 "methodology_tags": [ 20 "benchmark-eval" 21 ], 22 "claims": [ 23 { 24 "claim": "3DShape2VecSet achieves state-of-the-art shape autoencoding on ShapeNet, outperforming OccNet, ConvOccNet, IF-Net, and 3DILG across IoU, Chamfer distance, and F-Score.", 25 "evidence": "Table 3 shows mean IoU 0.965 (Point Queries) vs 0.953 (3DILG), mean Chamfer 0.038 vs 0.040, mean F-Score 0.970 vs 0.966 across all 55 categories.", 26 "supported": "strong" 27 }, 28 { 29 "claim": "The latent set diffusion model achieves state-of-the-art unconditional 3D shape generation on full ShapeNet.", 30 "evidence": "Table 6 shows Surface-FPD 0.76 (C0=32) vs 1.89 (3DILG), Rendering-FID 17.08 vs 24.83 (3DILG). Table 7 shows large margin over PVD (FPD 0.63 vs 2.33).", 31 "supported": "strong" 32 }, 33 { 34 "claim": "Point Queries outperform Learned Queries for shape encoding.", 35 "evidence": "Table 3 shows Point Queries consistently better across all 7 categories and all 3 metrics.", 36 "supported": "strong" 37 }, 38 { 39 "claim": "First demonstration of text-conditioned 3D shape generation using diffusion models.", 40 "evidence": "Section 8.4 with qualitative results in Fig. 11 only. 
No quantitative metrics provided.", 41 "supported": "weak" 42 }, 43 { 44 "claim": "Category-conditioned generation shows improved recall over competing methods while maintaining comparable precision.", 45 "evidence": "Table 9: recall 0.86 for chair vs 0.65 (3DILG) and 0.57 (NeuralWavelet); recall 0.89 for table vs 0.59 (3DILG).", 46 "supported": "moderate" 47 } 48 ], 49 "key_findings": "3DShape2VecSet proposes a novel 3D shape representation encoding neural fields as a fixed-size set of latent vectors without explicit spatial coordinates, processed via cross-attention. On ShapeNet-v2, it achieves state-of-the-art shape autoencoding (IoU 0.965 vs prior 0.953) and unconditional generation (FPD 0.76 vs 1.89). The paper demonstrates multiple conditional generation applications including text-, category-, image-, and point-cloud-conditioned shape generation using latent diffusion.", 50 "red_flags": [ 51 { 52 "flag": "No uncertainty quantification", 53 "detail": "All results across Tables 3-9 are single point estimates with no error bars, standard deviations, or multi-seed results. Impossible to assess whether improvements are within noise." 54 }, 55 { 56 "flag": "No statistical significance testing", 57 "detail": "Comparative claims ('our results are best', 'beat PVD by a large margin') made without any statistical tests." 58 }, 59 { 60 "flag": "Text-conditioned generation claim unsupported quantitatively", 61 "detail": "Section 8.4 claims 'first text-conditioned 3D generation using diffusion models' with only qualitative figure comparisons, no quantitative metrics." 62 }, 63 { 64 "flag": "Generalization claims not bounded to ShapeNet", 65 "detail": "All experiments on ShapeNet-v2 (synthetic man-made objects) but claims framed broadly without bounding to this dataset."
66 } 67 ], 68 "checklist": { 69 "artifacts": { 70 "code_released": { 71 "applies": true, 72 "answer": true, 73 "justification": "Code URL provided in abstract: 'Code: https://1zb.github.io/3DShape2VecSet/'" 74 }, 75 "data_released": { 76 "applies": true, 77 "answer": true, 78 "justification": "Uses publicly available ShapeNet-v2, 3D-R2N2 renderings, and ShapeGlot text prompts. All standard public benchmarks." 79 }, 80 "environment_specified": { 81 "applies": true, 82 "answer": false, 83 "justification": "Hardware mentioned (8 A100 for autoencoder, 4 A100 for diffusion) but no requirements.txt, Dockerfile, or library version specifications provided in the paper." 84 }, 85 "reproduction_instructions": { 86 "applies": true, 87 "answer": false, 88 "justification": "No step-by-step reproduction instructions in the paper. Implementation details in Section 7.3 but no runnable commands or README-style instructions." 89 } 90 }, 91 "statistical_methodology": { 92 "confidence_intervals_or_error_bars": { 93 "applies": true, 94 "answer": false, 95 "justification": "All tables (3-9) report only point estimates. No confidence intervals, error bars, or ± notation found." 96 }, 97 "significance_tests": { 98 "applies": true, 99 "answer": false, 100 "justification": "Claims of improvement are based solely on comparing numbers without any statistical significance tests." 101 }, 102 "effect_sizes_reported": { 103 "applies": true, 104 "answer": true, 105 "justification": "Tables provide absolute metric values for both proposed method and baselines (e.g., IoU 0.965 vs 0.953, FPD 0.76 vs 1.89), allowing readers to assess magnitude of improvement in context." 106 }, 107 "sample_size_justified": { 108 "applies": true, 109 "answer": false, 110 "justification": "No justification for dataset size or number of generated samples used for evaluation metrics." 
111 }, 112 "variance_reported": { 113 "applies": true, 114 "answer": false, 115 "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or multiple-run results reported." 116 } 117 }, 118 "evaluation_design": { 119 "baselines_included": { 120 "applies": true, 121 "answer": true, 122 "justification": "Multiple baselines: OccNet, ConvOccNet, IF-Net, 3DILG for autoencoding; PVD, 3DILG, NeuralWavelet, Grid-8³, 3DShapeGen, AutoSDF for generation (Section 7.1)." 123 }, 124 "baselines_contemporary": { 125 "applies": true, 126 "answer": true, 127 "justification": "Baselines include 3DILG (2022), NeuralWavelet (2022), PVD (2021) — all contemporary for a 2023 paper." 128 }, 129 "ablation_study": { 130 "applies": true, 131 "answer": true, 132 "justification": "Table 4 ablates M (512, 256, 128, 64); Table 5 ablates C0 (1-64); Table 3 compares Learned vs Point Queries." 133 }, 134 "multiple_metrics": { 135 "applies": true, 136 "answer": true, 137 "justification": "Autoencoding: IoU, Chamfer, F-Score. Generation: FPD, KPD, FID, KID, Precision, Recall, MMD-CD, MMD-EMD, COV-CD, COV-EMD." 138 }, 139 "human_evaluation": { 140 "applies": true, 141 "answer": false, 142 "justification": "No human evaluation of generated shape quality. For generative modeling, human perceptual evaluation is relevant but was not included." 143 }, 144 "held_out_test_set": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 7 states 'We use the training/val splits in [Zhang et al. 2022].' Section 8.1 references 'test split.'" 148 }, 149 "per_category_breakdown": { 150 "applies": true, 151 "answer": true, 152 "justification": "Table 3 shows per-category results for 7 largest ShapeNet categories plus overall mean. Tables 8-9 show per-category generation results." 153 }, 154 "failure_cases_discussed": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section 4 states 'We initially explored many variations...
Ultimately, we could not improve on existing irregular grids.' Section 8.8 discusses limitations of two-stage training." 158 }, 159 "negative_results_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "C0=64 gives worse generation results than C0=32 (Table 6). Section 4 reports that tri-planes, frequency compositions, and factored representations failed to improve over irregular grids." 163 } 164 }, 165 "claims_and_evidence": { 166 "abstract_claims_supported": { 167 "applies": true, 168 "answer": true, 169 "justification": "Abstract claims of 'improved performance in 3D shape encoding and generative modeling' supported by Tables 3-9. All claimed applications demonstrated." 170 }, 171 "causal_claims_justified": { 172 "applies": true, 173 "answer": true, 174 "justification": "Causal claims supported by controlled ablation studies: Tables 4, 5 show single-variable manipulation of M and C0. Learnable vs Point Queries comparison in Table 3." 175 }, 176 "generalization_bounded": { 177 "applies": true, 178 "answer": false, 179 "justification": "Results only on ShapeNet-v2 (synthetic man-made objects) but claims framed broadly as '3D shape encoding and generative modeling' without bounding to this dataset." 180 }, 181 "alternative_explanations_discussed": { 182 "applies": true, 183 "answer": false, 184 "justification": "No discussion of alternative explanations. Does not consider whether improvements stem from increased model capacity, training duration, or other confounds vs the representation design." 185 }, 186 "proxy_outcome_distinction": { 187 "applies": true, 188 "answer": true, 189 "justification": "The paper measures specific metrics (IoU, Chamfer distance, F-Score, FID, KID, FPD, KPD) and frames claims at the same granularity: 'improved performance in 3D shape encoding and generative modeling' as measured by these metrics. No broader framing (e.g., 'visual quality') is claimed beyond what is measured." 
190 } 191 }, 192 "setup_transparency": { 193 "model_versions_specified": { 194 "applies": false, 195 "answer": false, 196 "justification": "Trains own neural networks from scratch; does not use pre-trained LLM APIs with versioning concerns." 197 }, 198 "prompts_provided": { 199 "applies": false, 200 "answer": false, 201 "justification": "Does not use prompting. All models trained end-to-end." 202 }, 203 "hyperparameters_reported": { 204 "applies": true, 205 "answer": true, 206 "justification": "Section 7.3: batch sizes (512, 256), learning rates (5e-5, 1e-4), epochs (1600, 8000), warmup + cosine decay, KL weight 0.001, M=512, C=512, C0=32, 18 denoising steps. EDM defaults referenced." 207 }, 208 "scaffolding_described": { 209 "applies": false, 210 "answer": false, 211 "justification": "No agentic scaffolding used. Standard two-stage neural network training pipeline." 212 }, 213 "data_preprocessing_documented": { 214 "applies": true, 215 "answer": true, 216 "justification": "Section 7: shapes → watertight meshes → normalized to bounding box → 500K surface points, 500K occupancy points from volume, 500K near-surface. Rendering and text prompt sources documented." 217 } 218 }, 219 "limitations_and_scope": { 220 "limitations_section_present": { 221 "applies": true, 222 "answer": true, 223 "justification": "Section 8.8 'Limitations' discusses drawbacks of two-stage training strategy." 224 }, 225 "threats_to_validity_specific": { 226 "applies": true, 227 "answer": false, 228 "justification": "Limitations section discusses training cost but not threats to validity of performance claims (e.g., ShapeNet-only evaluation, single-run variance, metric limitations)." 229 }, 230 "scope_boundaries_stated": { 231 "applies": true, 232 "answer": false, 233 "justification": "No explicit statement of what results do NOT show. No bounding of claims to ShapeNet or noting potential non-transferability to real-world data." 
234 } 235 }, 236 "data_integrity": { 237 "raw_data_available": { 238 "applies": true, 239 "answer": true, 240 "justification": "ShapeNet-v2 is publicly available for independent verification." 241 }, 242 "data_collection_described": { 243 "applies": true, 244 "answer": true, 245 "justification": "Section 7 describes ShapeNet-v2 with splits from Zhang et al. 2022 and full preprocessing pipeline." 246 }, 247 "recruitment_methods_described": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants. Data is a standard public benchmark (ShapeNet-v2)." 251 }, 252 "data_pipeline_documented": { 253 "applies": true, 254 "answer": true, 255 "justification": "Full pipeline documented: ShapeNet meshes → watertight conversion → normalization → point cloud/occupancy sampling." 256 } 257 }, 258 "conflicts_of_interest": { 259 "funding_disclosed": { 260 "applies": true, 261 "answer": true, 262 "justification": "Acknowledgments: 'supported by the SDAIA-KAUST Center of Excellence in Data Science and AI as well as the ERC Starting Grant Scan2CAD (804724).'" 263 }, 264 "affiliations_disclosed": { 265 "applies": true, 266 "answer": true, 267 "justification": "Author affiliations clearly listed: KAUST and TU Munich. No commercial product evaluated." 268 }, 269 "funder_independent_of_outcome": { 270 "applies": true, 271 "answer": true, 272 "justification": "SDAIA-KAUST AI and ERC are academic/government funders with no financial stake in the results." 273 }, 274 "financial_interests_declared": { 275 "applies": true, 276 "answer": false, 277 "justification": "No competing interests statement found in the paper." 278 } 279 }, 280 "contamination": { 281 "training_cutoff_stated": { 282 "applies": false, 283 "answer": false, 284 "justification": "Trains own models from scratch on ShapeNet. No pre-trained model capability evaluation; contamination not applicable." 
285 }, 286 "train_test_overlap_discussed": { 287 "applies": false, 288 "answer": false, 289 "justification": "Trains own models from scratch on ShapeNet using standard splits; train/test overlap arising from pre-training contamination is not applicable." 290 }, 291 "benchmark_contamination_addressed": { 292 "applies": false, 293 "answer": false, 294 "justification": "No pre-trained model capabilities evaluated. Benchmark contamination not applicable." 295 } 296 }, 297 "human_studies": { 298 "pre_registered": { 299 "applies": false, 300 "answer": false, 301 "justification": "No human participants." 302 }, 303 "irb_or_ethics_approval": { 304 "applies": false, 305 "answer": false, 306 "justification": "No human participants." 307 }, 308 "demographics_reported": { 309 "applies": false, 310 "answer": false, 311 "justification": "No human participants." 312 }, 313 "inclusion_exclusion_criteria": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants." 317 }, 318 "randomization_described": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants." 322 }, 323 "blinding_described": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human participants." 327 }, 328 "attrition_reported": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants." 332 } 333 }, 334 "cost_and_practicality": { 335 "inference_cost_reported": { 336 "applies": true, 337 "answer": false, 338 "justification": "Only 18 denoising steps mentioned. No wall-clock inference time, latency per shape, or cost per generation reported." 339 }, 340 "compute_budget_stated": { 341 "applies": true, 342 "answer": true, 343 "justification": "Section 7.3: autoencoder trained on 8 A100 for 1600 epochs; diffusion on 4 A100 for 8000 epochs. Hardware and training duration provided." 344 } 345 }, 346 "experimental_rigor": { 347 "seed_sensitivity_reported": { 348 "applies": true, 349 "answer": false, 350 "justification": "No multi-seed experiments.
All results appear single-run. Generative metrics (FID, KID) are known to be seed-sensitive." 351 }, 352 "number_of_runs_stated": { 353 "applies": true, 354 "answer": false, 355 "justification": "Number of experimental runs never stated. Results presented without indicating single or multiple runs." 356 }, 357 "hyperparameter_search_budget": { 358 "applies": true, 359 "answer": false, 360 "justification": "Ablation studies explore M and C0 values but no systematic search budget reported. Exploration appears selective." 361 }, 362 "best_config_selection_justified": { 363 "applies": true, 364 "answer": true, 365 "justification": "Tables 4-6 show ablation results for M and C0; best configuration (M=512, C0=32) selected based on reported metrics with transparent criteria." 366 }, 367 "multiple_comparison_correction": { 368 "applies": false, 369 "answer": false, 370 "justification": "No statistical tests performed, so multiple comparison correction not applicable." 371 }, 372 "self_comparison_bias_addressed": { 373 "applies": true, 374 "answer": false, 375 "justification": "Authors re-implement Grid-8³ baseline and re-train PVD. No acknowledgment of potential bias from re-implementing competitors." 376 }, 377 "compute_budget_vs_performance": { 378 "applies": true, 379 "answer": false, 380 "justification": "No comparison at matched compute budgets. Proposed method uses 8+4 A100 GPUs but baseline compute requirements not reported for comparison." 381 }, 382 "benchmark_construct_validity": { 383 "applies": true, 384 "answer": false, 385 "justification": "ShapeNet used as sole benchmark without discussing whether it adequately measures 3D shape generation quality. No construct validity discussion." 386 }, 387 "scaffold_confound_addressed": { 388 "applies": false, 389 "answer": false, 390 "justification": "No scaffolding or tool framework is involved. The paper trains and evaluates its own neural networks from scratch using standard training pipelines.
Scaffolding confounds are not applicable." 391 } 392 }, 393 "data_leakage": { 394 "temporal_leakage_addressed": { 395 "applies": false, 396 "answer": false, 397 "justification": "Models trained from scratch on ShapeNet with standard splits. No pre-trained model that could have seen test data." 398 }, 399 "feature_leakage_addressed": { 400 "applies": false, 401 "answer": false, 402 "justification": "Standard train/test evaluation; no pre-trained model being probed for knowledge of test data." 403 }, 404 "non_independence_addressed": { 405 "applies": true, 406 "answer": false, 407 "justification": "No discussion of whether ShapeNet train and test splits contain highly similar objects. Uses splits from Zhang et al. 2022 without analyzing potential non-independence." 408 }, 409 "leakage_detection_method": { 410 "applies": false, 411 "answer": false, 412 "justification": "Not applicable for train-from-scratch setup on standard benchmark with defined splits." 413 } 414 } 415 }, 416 "cited_papers": [ 417 { 418 "title": "Denoising Diffusion Probabilistic Models", 419 "authors": [ 420 "Jonathan Ho", 421 "Ajay Jain", 422 "Pieter Abbeel" 423 ], 424 "year": 2020, 425 "relevance": "Foundational diffusion model paper underpinning the 3D generation methodology." 426 }, 427 { 428 "title": "High-resolution image synthesis with latent diffusion models", 429 "authors": [ 430 "Robin Rombach", 431 "Andreas Blattmann", 432 "Dominik Lorenz", 433 "Patrick Esser", 434 "Björn Ommer" 435 ], 436 "year": 2022, 437 "relevance": "Latent diffusion approach directly adapted for 3D shape generation in this work." 438 }, 439 { 440 "title": "Elucidating the Design Space of Diffusion-Based Generative Models", 441 "authors": [ 442 "Tero Karras", 443 "Miika Aittala", 444 "Timo Aila", 445 "Samuli Laine" 446 ], 447 "year": 2022, 448 "relevance": "EDM framework used for training details and denoising objective formulation." 
449 }, 450 { 451 "title": "Attention is all you need", 452 "authors": [ 453 "Ashish Vaswani" 454 ], 455 "year": 2017, 456 "relevance": "Transformer attention mechanism core to the proposed latent set representation." 457 }, 458 { 459 "title": "3DILG: Irregular Latent Grids for 3D Generative Modeling", 460 "authors": [ 461 "Biao Zhang", 462 "Matthias Nießner", 463 "Peter Wonka" 464 ], 465 "year": 2022, 466 "relevance": "Directly preceding work from same authors; key baseline comparison." 467 }, 468 { 469 "title": "Neural wavelet-domain diffusion for 3d shape generation", 470 "authors": [ 471 "Ka-Hei Hui", 472 "Ruihui Li", 473 "Jingyu Hu", 474 "Chi-Wing Fu" 475 ], 476 "year": 2022, 477 "relevance": "Key competing method (NeuralWavelet) used as generation baseline." 478 }, 479 { 480 "title": "Occupancy networks: Learning 3d reconstruction in function space", 481 "authors": [ 482 "Lars Mescheder" 483 ], 484 "year": 2019, 485 "relevance": "Foundational neural field method; autoencoding baseline." 486 }, 487 { 488 "title": "Perceiver: General perception with iterative attention", 489 "authors": [ 490 "Andrew Jaegle" 491 ], 492 "year": 2021, 493 "relevance": "Cross-attention architecture for compressing large inputs into fixed-size latent sets; directly inspired encoding design." 494 }, 495 { 496 "title": "BERT: Pre-training of deep bidirectional transformers for language understanding", 497 "authors": [ 498 "Jacob Devlin" 499 ], 500 "year": 2018, 501 "relevance": "Used as text encoder for text-conditioned 3D shape generation experiments." 502 }, 503 { 504 "title": "DiffusionSDF: Conditional Generative Modeling of Signed Distance Functions", 505 "authors": [ 506 "Gene Chou", 507 "Yuval Bahat", 508 "Felix Heide" 509 ], 510 "year": 2022, 511 "arxiv_id": "2211.13757", 512 "relevance": "Concurrent work using diffusion in latent space for neural field generation." 
513 }, 514 { 515 "title": "LION: Latent Point Diffusion Models for 3D Shape Generation", 516 "authors": [ 517 "Xiaohui Zeng" 518 ], 519 "year": 2022, 520 "arxiv_id": "2210.06978", 521 "relevance": "Related diffusion model for 3D point cloud generation." 522 } 523 ], 524 "engagement_factors": { 525 "practical_relevance": { 526 "score": 1, 527 "justification": "Useful for 3D graphics researchers but requires significant expertise and compute to adapt to production workflows." 528 }, 529 "surprise_contrarian": { 530 "score": 0, 531 "justification": "Confirms the expected trend that learned latent representations outperform hand-designed ones for 3D generation." 532 }, 533 "fear_safety": { 534 "score": 0, 535 "justification": "No safety, security, or risk implications in 3D shape representation research." 536 }, 537 "drama_conflict": { 538 "score": 0, 539 "justification": "Straightforward incremental improvement over prior methods with no controversy or challenge to industry claims." 540 }, 541 "demo_ability": { 542 "score": 1, 543 "justification": "Code is released but requires multi-GPU training on ShapeNet data, making casual reproduction impractical." 544 }, 545 "brand_recognition": { 546 "score": 1, 547 "justification": "KAUST and TU Munich are recognized in computer vision but are not household names in broader tech audiences." 548 } 549 } 550 }