scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23557B)
      1 {
      2   "scan_version": 3,
      3   "active_modules": [
      4     "experimental_rigor",
      5     "data_leakage"
      6   ],
      7   "paper": {
      8     "title": "3DShape2VecSet: A 3D Shape Representation for Neural Fields and Generative Diffusion Models",
      9     "authors": [
     10       "Biao Zhang",
     11       "Jiapeng Tang",
     12       "Matthias Nießner",
     13       "Peter Wonka"
     14     ],
     15     "year": 2023,
     16     "venue": "ACM Transactions on Graphics (SIGGRAPH)",
     17     "arxiv_id": "2301.11445"
     18   },
     19   "methodology_tags": [
     20     "benchmark-eval"
     21   ],
     22   "claims": [
     23     {
     24       "claim": "3DShape2VecSet achieves state-of-the-art shape autoencoding on ShapeNet, outperforming OccNet, ConvOccNet, IF-Net, and 3DILG across IoU, Chamfer distance, and F-Score.",
     25       "evidence": "Table 3 shows mean IoU 0.965 (Point Queries) vs 0.953 (3DILG), mean Chamfer 0.038 vs 0.040, mean F-Score 0.970 vs 0.966 across all 55 categories.",
     26       "supported": "strong"
     27     },
     28     {
     29       "claim": "The latent set diffusion model achieves state-of-the-art unconditional 3D shape generation on full ShapeNet.",
     30       "evidence": "Table 6 shows Surface-FPD 0.76 (C0=32) vs 1.89 (3DILG), Rendering-FID 17.08 vs 24.83 (3DILG). Table 7 shows large margin over PVD (FPD 0.63 vs 2.33).",
     31       "supported": "strong"
     32     },
     33     {
     34       "claim": "Point Queries outperform Learned Queries for shape encoding.",
     35       "evidence": "Table 3 shows Point Queries consistently better across all 7 categories and all 3 metrics.",
     36       "supported": "strong"
     37     },
     38     {
     39       "claim": "First demonstration of text-conditioned 3D shape generation using diffusion models.",
     40       "evidence": "Section 8.4 with qualitative results in Fig. 11 only. No quantitative metrics provided.",
     41       "supported": "weak"
     42     },
     43     {
     44       "claim": "Category-conditioned generation shows improved recall over competing methods while maintaining comparable precision.",
     45       "evidence": "Table 9: recall 0.86 for chair vs 0.65 (3DILG) and 0.57 (NeuralWavelet); recall 0.89 for table vs 0.59 (3DILG).",
     46       "supported": "moderate"
     47     }
     48   ],
     49   "key_findings": "3DShape2VecSet proposes a novel 3D shape representation encoding neural fields as a fixed-size set of latent vectors without explicit spatial coordinates, processed via cross-attention. On ShapeNet-v2, it achieves state-of-the-art shape autoencoding (IoU 0.965 vs prior 0.953) and unconditional generation (FPD 0.76 vs 1.89). The paper demonstrates multiple conditional generation applications including text-, category-, image-, and point-cloud-conditioned shape generation using latent diffusion.",
     50   "red_flags": [
     51     {
     52       "flag": "No uncertainty quantification",
     53       "detail": "All results across Tables 3-9 are single point estimates with no error bars, standard deviations, or multi-seed results. Impossible to assess whether improvements are within noise."
     54     },
     55     {
     56       "flag": "No statistical significance testing",
     57       "detail": "Comparative claims ('our results are best', 'beat PVD by a large margin') made without any statistical tests."
     58     },
     59     {
     60       "flag": "Text-conditioned generation claim unsupported quantitatively",
     61       "detail": "Section 8.4 claims 'first text-conditioned 3D generation using diffusion models' with only qualitative figure comparisons, no quantitative metrics."
     62     },
     63     {
     64       "flag": "Generalization claims not bounded to ShapeNet",
     65       "detail": "All experiments on ShapeNet-v2 (synthetic man-made objects) but claims framed broadly without bounding to this dataset."
     66     }
     67   ],
     68   "checklist": {
     69     "artifacts": {
     70       "code_released": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Code URL provided in abstract: 'Code: https://1zb.github.io/3DShape2VecSet/'"
     74       },
     75       "data_released": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Uses publicly available ShapeNet-v2, 3D-R2N2 renderings, and ShapeGlot text prompts. All standard public benchmarks."
     79       },
     80       "environment_specified": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Hardware mentioned (8 A100 for autoencoder, 4 A100 for diffusion) but no requirements.txt, Dockerfile, or library version specifications provided in the paper."
     84       },
     85       "reproduction_instructions": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No step-by-step reproduction instructions in the paper. Implementation details in Section 7.3 but no runnable commands or README-style instructions."
     89       }
     90     },
     91     "statistical_methodology": {
     92       "confidence_intervals_or_error_bars": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "All tables (3-9) report only point estimates. No confidence intervals, error bars, or ± notation found."
     96       },
     97       "significance_tests": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "Claims of improvement are based solely on comparing numbers without any statistical significance tests."
    101       },
    102       "effect_sizes_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Tables provide absolute metric values for both proposed method and baselines (e.g., IoU 0.965 vs 0.953, FPD 0.76 vs 1.89), allowing readers to assess magnitude of improvement in context."
    106       },
    107       "sample_size_justified": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "No justification for dataset size or number of generated samples used for evaluation metrics."
    111       },
    112       "variance_reported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or multiple-run results reported."
    116       }
    117     },
    118     "evaluation_design": {
    119       "baselines_included": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Multiple baselines: OccNet, ConvOccNet, IF-Net, 3DILG for autoencoding; PVD, 3DILG, NeuralWavelet, Grid-8³, 3DShapeGen, AutoSDF for generation (Section 7.1)."
    123       },
    124       "baselines_contemporary": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Baselines include 3DILG (2022), NeuralWavelet (2022), PVD (2021) — all contemporary for a 2023 paper."
    128       },
    129       "ablation_study": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Table 4 ablates M (512, 256, 128, 64); Table 5 ablates C0 (1-64); Table 3 compares Learned vs Point Queries."
    133       },
    134       "multiple_metrics": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Autoencoding: IoU, Chamfer, F-Score. Generation: FPD, KPD, FID, KID, Precision, Recall, MMD-CD, MMD-EMD, COV-CD, COV-EMD."
    138       },
    139       "human_evaluation": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No human evaluation of generated shape quality. For generative modeling, human perceptual evaluation is relevant but was not included."
    143       },
    144       "held_out_test_set": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 7 states 'We use the training/val splits in [Zhang et al. 2022].' Section 8.1 references 'test split.'"
    148       },
    149       "per_category_breakdown": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table 3 shows per-category results for 7 largest ShapeNet categories plus overall mean. Tables 8-9 show per-category generation results."
    153       },
    154       "failure_cases_discussed": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 4 states 'We initially explored many variations... Ultimately, we could not improve on existing irregular grids.' Section 8.8 discusses limitations of two-stage training."
    158       },
    159       "negative_results_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "C0=64 gives worse generation results than C0=32 (Table 6). Section 4 reports that tri-planes, frequency compositions, and factored representations failed to improve over irregular grids."
    163       }
    164     },
    165     "claims_and_evidence": {
    166       "abstract_claims_supported": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Abstract claims of 'improved performance in 3D shape encoding and generative modeling' supported by Tables 3-9. All claimed applications demonstrated."
    170       },
    171       "causal_claims_justified": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Causal claims supported by controlled ablation studies: Tables 4, 5 show single-variable manipulation of M and C0. Learned vs Point Queries comparison in Table 3."
    175       },
    176       "generalization_bounded": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "Results only on ShapeNet-v2 (synthetic man-made objects) but claims framed broadly as '3D shape encoding and generative modeling' without bounding to this dataset."
    180       },
    181       "alternative_explanations_discussed": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No discussion of alternative explanations. Does not consider whether improvements stem from increased model capacity, training duration, or other confounds vs the representation design."
    185       },
    186       "proxy_outcome_distinction": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The paper measures specific metrics (IoU, Chamfer distance, F-Score, FID, KID, FPD, KPD) and frames claims at the same granularity: 'improved performance in 3D shape encoding and generative modeling' as measured by these metrics. No broader framing (e.g., 'visual quality') is claimed beyond what is measured."
    190       }
    191     },
    192     "setup_transparency": {
    193       "model_versions_specified": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "Trains own neural networks from scratch; does not use pre-trained LLM APIs with versioning concerns."
    197       },
    198       "prompts_provided": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "Does not use prompting. All models trained end-to-end."
    202       },
    203       "hyperparameters_reported": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 7.3: batch sizes (512, 256), learning rates (5e-5, 1e-4), epochs (1600, 8000), warmup + cosine decay, KL weight 0.001, M=512, C=512, C0=32, 18 denoising steps. EDM defaults referenced."
    207       },
    208       "scaffolding_described": {
    209         "applies": false,
    210         "answer": false,
    211         "justification": "No agentic scaffolding used. Standard two-stage neural network training pipeline."
    212       },
    213       "data_preprocessing_documented": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Section 7: shapes → watertight meshes → normalized to bounding box → 500K surface points, 500K occupancy points from volume, 500K near-surface. Rendering and text prompt sources documented."
    217       }
    218     },
    219     "limitations_and_scope": {
    220       "limitations_section_present": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Section 8.8 'Limitations' discusses drawbacks of two-stage training strategy."
    224       },
    225       "threats_to_validity_specific": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Limitations section discusses training cost but not threats to validity of performance claims (e.g., ShapeNet-only evaluation, single-run variance, metric limitations)."
    229       },
    230       "scope_boundaries_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No explicit statement of what results do NOT show. No bounding of claims to ShapeNet or noting potential non-transferability to real-world data."
    234       }
    235     },
    236     "data_integrity": {
    237       "raw_data_available": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "ShapeNet-v2 is publicly available for independent verification."
    241       },
    242       "data_collection_described": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Section 7 describes ShapeNet-v2 with splits from Zhang et al. 2022 and full preprocessing pipeline."
    246       },
    247       "recruitment_methods_described": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants. Data is a standard public benchmark (ShapeNet-v2)."
    251       },
    252       "data_pipeline_documented": {
    253         "applies": true,
    254         "answer": true,
    255         "justification": "Full pipeline documented: ShapeNet meshes → watertight conversion → normalization → point cloud/occupancy sampling."
    256       }
    257     },
    258     "conflicts_of_interest": {
    259       "funding_disclosed": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "Acknowledgments: 'supported by the SDAIA-KAUST Center of Excellence in Data Science and AI as well as the ERC Starting Grant Scan2CAD (804724).'"
    263       },
    264       "affiliations_disclosed": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "Author affiliations clearly listed: KAUST and TU Munich. No commercial product evaluated."
    268       },
    269       "funder_independent_of_outcome": {
    270         "applies": true,
    271         "answer": true,
    272         "justification": "The SDAIA-KAUST Center of Excellence in Data Science and AI and the ERC are academic/government funders with no financial stake in the results."
    273       },
    274       "financial_interests_declared": {
    275         "applies": true,
    276         "answer": false,
    277         "justification": "No competing interests statement found in the paper."
    278       }
    279     },
    280     "contamination": {
    281       "training_cutoff_stated": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Trains own models from scratch on ShapeNet. No pre-trained model capability evaluation; contamination not applicable."
    285       },
    286       "train_test_overlap_discussed": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Trains own models from scratch on standard ShapeNet splits; pre-training contamination concept not applicable."
    290       },
    291       "benchmark_contamination_addressed": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No pre-trained model capabilities evaluated. Benchmark contamination not applicable."
    295       }
    296     },
    297     "human_studies": {
    298       "pre_registered": {
    299         "applies": false,
    300         "answer": false,
    301         "justification": "No human participants."
    302       },
    303       "irb_or_ethics_approval": {
    304         "applies": false,
    305         "answer": false,
    306         "justification": "No human participants."
    307       },
    308       "demographics_reported": {
    309         "applies": false,
    310         "answer": false,
    311         "justification": "No human participants."
    312       },
    313       "inclusion_exclusion_criteria": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No human participants."
    317       },
    318       "randomization_described": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No human participants."
    322       },
    323       "blinding_described": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No human participants."
    327       },
    328       "attrition_reported": {
    329         "applies": false,
    330         "answer": false,
    331         "justification": "No human participants."
    332       }
    333     },
    334     "cost_and_practicality": {
    335       "inference_cost_reported": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "Only 18 denoising steps mentioned. No wall-clock inference time, latency per shape, or cost per generation reported."
    339       },
    340       "compute_budget_stated": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Section 7.3: autoencoder trained on 8 A100 for 1600 epochs; diffusion on 4 A100 for 8000 epochs. Hardware and training duration provided."
    344       }
    345     },
    346     "experimental_rigor": {
    347       "seed_sensitivity_reported": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No multi-seed experiments. All results appear single-run. Generative metrics (FID, KID) are known to be seed-sensitive."
    351       },
    352       "number_of_runs_stated": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "Number of experimental runs never stated. Results presented without indicating single or multiple runs."
    356       },
    357       "hyperparameter_search_budget": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "Ablation studies explore M and C0 values but no systematic search budget reported. Exploration appears selective."
    361       },
    362       "best_config_selection_justified": {
    363         "applies": true,
    364         "answer": true,
    365         "justification": "Tables 4-6 show ablation results for M and C0; best configuration (M=512, C0=32) selected based on reported metrics with transparent criteria."
    366       },
    367       "multiple_comparison_correction": {
    368         "applies": false,
    369         "answer": false,
    370         "justification": "No statistical tests performed, so multiple comparison correction not applicable."
    371       },
    372       "self_comparison_bias_addressed": {
    373         "applies": true,
    374         "answer": false,
    375         "justification": "Authors re-implement Grid-8³ baseline and re-train PVD. No acknowledgment of potential bias from re-implementing competitors."
    376       },
    377       "compute_budget_vs_performance": {
    378         "applies": true,
    379         "answer": false,
    380         "justification": "No comparison at matched compute budgets. Proposed method uses 8+4 A100 GPUs but baseline compute requirements not reported for comparison."
    381       },
    382       "benchmark_construct_validity": {
    383         "applies": true,
    384         "answer": false,
    385         "justification": "ShapeNet used as sole benchmark without discussing whether it adequately measures 3D shape generation quality. No construct validity discussion."
    386       },
    387       "scaffold_confound_addressed": {
    388         "applies": false,
    389         "answer": false,
    390         "justification": "No scaffolding or tool framework is involved. The paper trains and evaluates its own neural networks from scratch using standard training pipelines. Scaffolding confounds are not applicable."
    391       }
    392     },
    393     "data_leakage": {
    394       "temporal_leakage_addressed": {
    395         "applies": false,
    396         "answer": false,
    397         "justification": "Models trained from scratch on ShapeNet with standard splits. No pre-trained model that could have seen test data."
    398       },
    399       "feature_leakage_addressed": {
    400         "applies": false,
    401         "answer": false,
    402         "justification": "Standard train/test evaluation; no pre-trained model being probed for knowledge of test data."
    403       },
    404       "non_independence_addressed": {
    405         "applies": true,
    406         "answer": false,
    407         "justification": "No discussion of whether ShapeNet train and test splits contain highly similar objects. Uses splits from Zhang et al. 2022 without analyzing potential non-independence."
    408       },
    409       "leakage_detection_method": {
    410         "applies": false,
    411         "answer": false,
    412         "justification": "Not applicable for train-from-scratch setup on standard benchmark with defined splits."
    413       }
    414     }
    415   },
    416   "cited_papers": [
    417     {
    418       "title": "Denoising Diffusion Probabilistic Models",
    419       "authors": [
    420         "Jonathan Ho",
    421         "Ajay Jain",
    422         "Pieter Abbeel"
    423       ],
    424       "year": 2020,
    425       "relevance": "Foundational diffusion model paper underpinning the 3D generation methodology."
    426     },
    427     {
    428       "title": "High-resolution image synthesis with latent diffusion models",
    429       "authors": [
    430         "Robin Rombach",
    431         "Andreas Blattmann",
    432         "Dominik Lorenz",
    433         "Patrick Esser",
    434         "Björn Ommer"
    435       ],
    436       "year": 2022,
    437       "relevance": "Latent diffusion approach directly adapted for 3D shape generation in this work."
    438     },
    439     {
    440       "title": "Elucidating the Design Space of Diffusion-Based Generative Models",
    441       "authors": [
    442         "Tero Karras",
    443         "Miika Aittala",
    444         "Timo Aila",
    445         "Samuli Laine"
    446       ],
    447       "year": 2022,
    448       "relevance": "EDM framework used for training details and denoising objective formulation."
    449     },
    450     {
    451       "title": "Attention is all you need",
    452       "authors": [
    453         "Ashish Vaswani"
    454       ],
    455       "year": 2017,
    456       "relevance": "Transformer attention mechanism core to the proposed latent set representation."
    457     },
    458     {
    459       "title": "3DILG: Irregular Latent Grids for 3D Generative Modeling",
    460       "authors": [
    461         "Biao Zhang",
    462         "Matthias Nießner",
    463         "Peter Wonka"
    464       ],
    465       "year": 2022,
    466       "relevance": "Directly preceding work from same authors; key baseline comparison."
    467     },
    468     {
    469       "title": "Neural wavelet-domain diffusion for 3d shape generation",
    470       "authors": [
    471         "Ka-Hei Hui",
    472         "Ruihui Li",
    473         "Jingyu Hu",
    474         "Chi-Wing Fu"
    475       ],
    476       "year": 2022,
    477       "relevance": "Key competing method (NeuralWavelet) used as generation baseline."
    478     },
    479     {
    480       "title": "Occupancy networks: Learning 3d reconstruction in function space",
    481       "authors": [
    482         "Lars Mescheder"
    483       ],
    484       "year": 2019,
    485       "relevance": "Foundational neural field method; autoencoding baseline."
    486     },
    487     {
    488       "title": "Perceiver: General perception with iterative attention",
    489       "authors": [
    490         "Andrew Jaegle"
    491       ],
    492       "year": 2021,
    493       "relevance": "Cross-attention architecture for compressing large inputs into fixed-size latent sets; directly inspired encoding design."
    494     },
    495     {
    496       "title": "BERT: Pre-training of deep bidirectional transformers for language understanding",
    497       "authors": [
    498         "Jacob Devlin"
    499       ],
    500       "year": 2018,
    501       "relevance": "Used as text encoder for text-conditioned 3D shape generation experiments."
    502     },
    503     {
    504       "title": "DiffusionSDF: Conditional Generative Modeling of Signed Distance Functions",
    505       "authors": [
    506         "Gene Chou",
    507         "Yuval Bahat",
    508         "Felix Heide"
    509       ],
    510       "year": 2022,
    511       "arxiv_id": "2211.13757",
    512       "relevance": "Concurrent work using diffusion in latent space for neural field generation."
    513     },
    514     {
    515       "title": "LION: Latent Point Diffusion Models for 3D Shape Generation",
    516       "authors": [
    517         "Xiaohui Zeng"
    518       ],
    519       "year": 2022,
    520       "arxiv_id": "2210.06978",
    521       "relevance": "Related diffusion model for 3D point cloud generation."
    522     }
    523   ],
    524   "engagement_factors": {
    525     "practical_relevance": {
    526       "score": 1,
    527       "justification": "Useful for 3D graphics researchers but requires significant expertise and compute to adapt to production workflows."
    528     },
    529     "surprise_contrarian": {
    530       "score": 0,
    531       "justification": "Confirms the expected trend that learned latent representations outperform hand-designed ones for 3D generation."
    532     },
    533     "fear_safety": {
    534       "score": 0,
    535       "justification": "No safety, security, or risk implications in 3D shape representation research."
    536     },
    537     "drama_conflict": {
    538       "score": 0,
    539       "justification": "Straightforward incremental improvement over prior methods with no controversy or challenge to industry claims."
    540     },
    541     "demo_ability": {
    542       "score": 1,
    543       "justification": "Code is released but requires multi-GPU training on ShapeNet data, making casual reproduction impractical."
    544     },
    545     "brand_recognition": {
    546       "score": 1,
    547       "justification": "KAUST and TU Munich are recognized in computer vision but are not household names in broader tech audiences."
    548     }
    549   }
    550 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs