ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (23373B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "3DShape2VecSet: A 3D Shape Representation for Neural Fields and Generative Diffusion Models",
      6     "authors": [
      7       "Biao Zhang",
      8       "Jiapeng Tang",
      9       "Matthias Nießner",
     10       "Peter Wonka"
     11     ],
     12     "year": 2023,
     13     "venue": "ACM Transactions on Graphics",
     14     "arxiv_id": "2301.11445",
     15     "doi": "10.1145/3592442"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims (improved encoding quality, multiple generative applications) are backed by quantitative results in Tables 3–9 and qualitative figures throughout.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Ablation studies on M (number of latents) and C0 (compression channels) in Tables 4–5 directly support causal design claims; cross-attention vs. KNN encoding is also compared.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Claims of 'state of the art in 3D shape encoding and generative modeling' are made broadly, but evaluation is entirely on ShapeNet-v2; no cross-dataset or cross-domain validation is performed.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not discuss whether performance gains could stem from larger parameter counts, more training compute, or dataset-specific characteristics rather than the proposed architectural innovation.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly notes that rendering-based FID/KID are imperfect for 3D quality and introduces 3D-based FPD/KPD metrics to compensate, clearly distinguishing what each metric measures.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 8.8 is a dedicated 'Limitations' subsection discussing the two-stage training requirement and training time costs.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The limitations focus on computational cost and retraining requirements, not on threats to validity such as dataset bias, metric limitations, or whether improvements hold across domains.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what the results do NOT show; no claims are bounded to ShapeNet-only conclusions or specific shape categories.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Acknowledgements state support from SDAIA-KAUST Center of Excellence in Data Science and Artificial Intelligence and ERC Starting Grant Scan2CAD (804724).",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations (KAUST, TU Munich) are disclosed in the header and author addresses.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "SDAIA-KAUST AI and ERC are academic/government research funders with no direct commercial stake in the proposed representation.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests declaration is present in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Neural fields, latent sets, cross-attention, and the proposed VecSet representation are formally defined with equations in Sections 3–5.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Five numbered contributions are explicitly listed at the end of the introduction, covering representation, architecture, autoencoding, generation, and applications.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 provides a detailed taxonomy of prior methods (Table 1, Table 2) and the paper explicitly distinguishes its approach from 3DILG, ConvOccNet, and NeuralWavelet in both framing and evaluation.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The abstract directly links to 'Code: https://1zb.github.io/3DShape2VecSet/' indicating code is available at a project/repository page.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "ShapeNet-v2 is a publicly available benchmark dataset used without modification as the primary data source.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Training hardware (8 A100 GPUs) is mentioned but no requirements.txt, Dockerfile, or dependency specification is provided.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Training hyperparameters are reported, but no step-by-step reproduction guide is provided; reproducing the results would require guesswork or significant inference.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results in Tables 3–9 are single point estimates with no confidence intervals or error bars reported.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are used for any of the comparative claims against baselines.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Numerical improvements are shown with baseline context (e.g., FPD 1.89→0.76 vs 3DILG, IoU 0.953→0.965 mean all categories), providing readable effect sizes.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "ShapeNet dataset size is not justified; the choice of 55 categories or specific test splits is not discussed in terms of statistical power.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No variance, standard deviation, or results across multiple runs are reported for any experiment.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Multiple baselines included: OccNet, ConvOccNet, IF-Net, 3DILG for autoencoding; PVD, 3DILG, NeuralWavelet for generation.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "3DILG (NeurIPS 2022), NeuralWavelet (SIGGRAPH Asia 2022), and PVD (ICCV 2021) are recent and competitive baselines appropriate for a 2023 paper.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Tables 4 and 5 provide ablations on M (number of latents: 64–512) and C0 (compression channels: 1–64), and Sec. 5.1 compares learned vs. point queries.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Autoencoding uses IoU, Chamfer distance, and F-score; generation uses FPD, KPD, Rendering-FID, Rendering-KID, Precision, Recall, MMD-CD, MMD-EMD, COV-CD, COV-EMD.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Human evaluation is not standard practice for 3D shape reconstruction/generation benchmarks and is not included.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "The paper uses train/val splits from Zhang et al. 2022 and evaluates on held-out test shapes, including novel shape retrieval analysis in Sec. 8.7.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Table 3 shows per-category results for 7 largest ShapeNet categories; Table 8 shows category-conditioned generation for airplane, chair, table, car, sofa.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "Section 8.8 Limitations discusses training cost but shows no failure case examples or systematic analysis of where the method fails.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Ablation Tables 4–5 explicitly show performance degradation with smaller M and C0, and Table 6 shows C0=64 performs worse than C0=32 for generation.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "BERT and ResNet-18 are referenced without specific version or checkpoint dates; EDM training follows 'default settings' without fully specifying which configuration.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": false,
    242           "answer": false,
    243           "justification": "This is a 3D shape generation paper; no language model prompts or system instructions are used.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Learning rates (5e-5, 1e-4), batch sizes (512, 256), epochs (1600, 8000), warmup, KL weight (0.001), M=512, C0=32, and 18 denoising steps are all reported.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is involved; this is a supervised deep learning paper.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 7 describes converting shapes to watertight meshes, normalizing to bounding box, sampling 500K surface points, query point sampling strategy, and rendering setup.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "ShapeNet-v2 is publicly available (with registration) and the same public splits from Zhang et al. 2022 are used.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "The preprocessing pipeline is clearly described; the original ShapeNet data collection is documented in the referenced Chang et al. 2015 paper.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "Standard public benchmark; no participant recruitment involved.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Full pipeline from ShapeNet mesh → watertight mesh → normalized mesh → point cloud sampling → query point sampling for occupancy is described in Section 7.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "The paper trains its own models from scratch on ShapeNet; training cutoff contamination is not applicable.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "The paper does not evaluate a pre-trained language/foundation model on external benchmarks, so this item is not applicable.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "The models are trained from scratch on ShapeNet splits, not pre-trained large models being evaluated on unseen benchmarks.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in the study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in the study.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in the study.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in the study.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in the study.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in the study.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in the study.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "18 denoising steps are mentioned but no latency, memory, or per-shape inference cost is reported.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Hardware (8 A100 for autoencoder, 4 A100 for diffusion) and epochs are stated but total GPU-hours or compute cost are not quantified.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "3DShape2VecSet achieves state-of-the-art 3D shape autoencoding on ShapeNet with IoU of 0.965 (all categories), outperforming 3DILG (0.953).",
    374       "evidence": "Table 3 shows quantitative comparison across IoU, Chamfer distance, and F-score on 7 categories and all 55 categories against OccNet, ConvOccNet, IF-Net, and 3DILG.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "The latent set diffusion model achieves state-of-the-art unconditional 3D shape generation with Surface-FPD of 0.76, versus 1.89 for 3DILG.",
    379       "evidence": "Table 6 compares FPD, KPD, Rendering-FID, and Rendering-KID across Grid-8³, 3DILG, and the proposed method at different C0 values.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Point queries (subsampled point cloud) outperform learnable queries for shape encoding across all categories.",
    384       "evidence": "Table 3 consistently shows Point Queries column outperforming Learned Queries in IoU, Chamfer, and F-score for all 7 reported categories.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "The proposed method demonstrates the first text-conditioned 3D shape generation using diffusion models.",
    389       "evidence": "Section 8.4 states 'the first demonstration of text-conditioned 3D shape generation using diffusion models' with qualitative results in Fig. 11; no quantitative baseline exists.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Aggressive KL compression (C0=32) achieves nearly identical reconstruction quality to C0=64 while enabling easier diffusion model training.",
    394       "evidence": "Table 5 shows IoU 0.963 vs 0.964 for C0=32 vs C0=64, and Table 6 shows generation quality peaks at C0=32.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Category-conditioned generation achieves significantly better recall than NeuralWavelet (0.86 vs 0.57 for chair).",
    399       "evidence": "Table 9 shows Recall comparison: Ours 0.86, NW 0.57 for chair; Ours 0.89, NW 0.68 for table.",
    400       "supported": "strong"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval"
    405   ],
    406   "key_findings": "3DShape2VecSet proposes encoding 3D shapes as unordered sets of latent vectors without explicit spatial coordinates, using cross-attention as a learnable interpolation mechanism. This representation achieves state-of-the-art reconstruction (IoU 0.965 on ShapeNet) and generation quality (FPD 0.76 vs 1.89 for prior best), demonstrating that eliminating explicit positional coordinates and leveraging transformer-native set representations improves both encoding fidelity and generative modeling. The two-stage training (VAE + diffusion) with aggressive latent compression (C0=32 recommended) enables five conditional generation tasks while maintaining strong reconstruction quality.",
    407   "red_flags": [
    408     {
    409       "flag": "No statistical testing",
    410       "detail": "All comparative claims lack significance tests or confidence intervals; improvements over baselines are reported as single point estimates only."
    411     },
    412     {
    413       "flag": "Single dataset evaluation",
    414       "detail": "All experiments are conducted on ShapeNet-v2 only; no cross-dataset or out-of-distribution evaluation is performed despite broad SOTA claims."
    415     },
    416     {
    417       "flag": "Text conditioning not quantitatively evaluated",
    418       "detail": "The text-conditioned generation claim is supported only by qualitative figures (Fig. 11) with no quantitative metrics, yet the paper claims it as a novel first."
    419     },
    420     {
    421       "flag": "Proxy metric concerns not fully resolved",
    422       "detail": "Rendering-based FID/KID are acknowledged to be imperfect for 3D quality; while FPD/KPD are introduced, the PointNet++ feature extractor quality is itself not validated."
    423     },
    424     {
    425       "flag": "No variance across runs",
    426       "detail": "Training diffusion models is stochastic; no variance across random seeds or runs is reported for any generation metric."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "3DILG: Irregular Latent Grids for 3D Generative Modeling",
    432       "relevance": "Primary baseline and predecessor using irregular latent grids with autoregressive generation; the proposed method directly extends and improves upon this approach."
    433     },
    434     {
    435       "title": "Neural Wavelet-Domain Diffusion for 3D Shape Generation",
    436       "relevance": "Key competitor using diffusion models in wavelet frequency domain for 3D shape generation; compared in category-conditioned generation experiments."
    437     },
    438     {
    439       "title": "High-Resolution Image Synthesis with Latent Diffusion Models",
    440       "relevance": "Foundation for the two-stage latent diffusion approach adopted in this paper."
    441     },
    442     {
    443       "title": "Elucidating the Design Space of Diffusion-Based Generative Models (EDM)",
    444       "relevance": "Training framework and hyperparameters directly adopted for the diffusion model stage."
    445     },
    446     {
    447       "title": "ShapeNet: An Information-Rich 3D Model Repository",
    448       "relevance": "Primary benchmark dataset used for all experiments."
    449     },
    450     {
    451       "title": "Convolutional Occupancy Networks",
    452       "relevance": "Baseline using regular grid latents for neural field shape representation."
    453     },
    454     {
    455       "title": "Occupancy Networks: Learning 3D Reconstruction in Function Space",
    456       "relevance": "Foundational neural field method with global latent; used as baseline and motivating comparison."
    457     },
    458     {
    459       "title": "Attention Is All You Need",
    460       "relevance": "Transformer architecture underpinning the cross-attention and self-attention mechanisms central to the proposed representation."
    461     }
    462   ],
    463   "engagement_factors": {
    464     "practical_relevance": {
    465       "score": 2,
    466       "justification": "The method enables multiple practical 3D content creation applications (text-to-3D, image-to-3D, shape completion) with released code, but requires 8 A100 GPUs to train."
    467     },
    468     "surprise_contrarian": {
    469       "score": 1,
    470       "justification": "The insight that removing explicit spatial coordinates from latent vectors improves performance is counter-intuitive but not dramatically surprising given broader attention literature trends."
    471     },
    472     "fear_safety": {
    473       "score": 0,
    474       "justification": "No AI risk or safety concerns; purely a 3D representation learning paper."
    475     },
    476     "drama_conflict": {
    477       "score": 0,
    478       "justification": "Standard academic benchmark competition with no controversy or adversarial framing."
    479     },
    480     "demo_ability": {
    481       "score": 2,
    482       "justification": "Code is released at the project page and the method supports interactive applications like text-to-3D and image-to-3D that are demonstrable."
    483     },
    484     "brand_recognition": {
    485       "score": 1,
    486       "justification": "KAUST and TU Munich (with Nießner, known for 3D vision work) are respected but not high-profile AI labs on par with DeepMind or OpenAI."
    487     }
    488   },
    489   "hn_data": {
    490     "threads": [
    491       {
    492         "hn_id": "47334694",
    493         "title": "BitNet: Inference framework for 1-bit LLMs",
    494         "points": 370,
    495         "comments": 169,
    496         "url": "https://news.ycombinator.com/item?id=47334694",
    497         "created_at": "2026-03-11T12:27:15Z"
    498       }
    499     ],
    500     "top_points": 370,
    501     "total_points": 370,
    502     "total_comments": 169
    503   }
    504 }

Impressum · Datenschutz