scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17557B)
      1 {
      2   "paper": {
      3     "title": "ConCo: Optimizing Compilation of Concurrent Tensor Programs on Shared GPU",
      4     "authors": ["Jiamin Lu", "Jingwei Sun", "Yunlong Xu", "Peng Sun", "Guangzhong Sun"],
      5     "year": 2025,
      6     "venue": "ICS '25 (International Conference on Supercomputing)",
      7     "doi": "10.1145/3721145.3730433"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper uses operators from the ROLLER benchmark collection (publicly available), but does not release their own experimental data, configurations, or scripts."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions GPU types (RTX 2080 Ti, A100 80 GB) and CPUs but does not provide software versions, library versions, requirements files, or environment setup details beyond stating TVM/Ansor is used."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions, README, or scripts are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates (e.g., 'up to 69.85%', '1.2x') with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims ConCo outperforms baselines but provides no statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage improvements with baseline context throughout, e.g., JCT reduction of up to 69.85% compared to Baseline 1, throughput improvement of up to 1.2x, compilation time reduced by more than 2x. Tables 2-4 provide absolute latency values."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for why these particular models, operators, or number of experimental configurations were chosen. The repetition count (500 runs per operator) is stated but not justified."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. Results appear to be either single-run values or aggregates whose aggregation method is not reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Two baselines are defined: Baseline 1 (ConCo 100% code under default MPS) and Baseline 2 (ConCo 100% code with optimal resource allocation search). Ansor is also compared for compilation time."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Ansor (2020) and VELTAIR (2022) are discussed as the most relevant prior work. The paper argues VELTAIR cannot be applied to GPUs, and Ansor is the state-of-the-art DNN compiler used as baseline."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 4.6 (Features Analysis) analyzes what makes ConCo-generated code different. The evaluation separates single-operator (Section 4.4) from end-to-end (Section 4.5) scenarios, and the compilation speedup is evaluated separately (Section 4.2)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are used: JCT reduction, normalized throughput, inference latency, and compilation time."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Human evaluation is irrelevant for a systems/compiler optimization paper measuring throughput and latency."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a systems paper, not a machine learning model evaluation. There is no train/test split concept applicable."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per model (ResNet-18, VGG-19, BERT-small, DLRM), per GPU (2080 Ti, A100), per operator pair (Table 2), and per resource constraint level (Tables 3-4)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.5 discusses why end-to-end gains are smaller than single-operator gains, attributing it to the inability to control GPU operator scheduling. This is an honest discussion of where the approach is limited."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that end-to-end performance improvements are inferior to single-operator improvements (Section 4.5) and explains why. It also shows VELTAIR's approach fails on GPUs (Section 2.4, Figure 3)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of 1.2x throughput improvement, 69.85% JCT reduction, and 2x compilation speedup are all supported by results in Sections 4.2-4.5 and corresponding figures/tables."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims (ConCo improves throughput) supported by controlled experiments comparing code variants on the same hardware under the same workloads. Section 4.6 provides a feature analysis explaining why ConCo's code performs better."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Claims are bounded to the tested GPUs (2080 Ti, A100), tested models (ResNet-18, VGG-19, BERT-small, DLRM), and MPS-based sharing. The conclusion explicitly states future work will extend to more hardware architectures."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for the observed improvements, such as whether the gains are specific to MPS or would hold under other sharing mechanisms, or whether Ansor's cost model could be retrained to achieve similar results."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "This paper does not use LLMs or pre-trained models via API. The DNN models evaluated (ResNet-18, VGG-19, BERT-small, DLRM) are standard architectures compiled with TVM."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used in this paper."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Compilation hyperparameters are reported: measuring_batch_size=64, N=len(tasks), M=5, alpha=2.5% (Section 4.2). Ansor configuration follows TVM docs (900 x len(tasks) trials). Operator configurations in Table 1."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper describes how operators were selected from the ROLLER benchmark (Section 2.3), how batch sizes were adjusted (Table 1 note), and the experimental setup including concurrency levels (Table 5)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion briefly mentions future work but does not discuss limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. Scope boundaries are only implicit (tested on 2 GPUs, 4 models)."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental data (latency measurements, compilation logs) is made available."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The measurement procedure is described: operators run 500 times each (Section 4.4), compilation follows Ansor's process with specified hyperparameters, and the scheduling model is described (Section 4.5)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data comes from standard benchmarks and compiler-generated code."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The compilation pipeline (Figure 6) and evaluation pipeline are documented: models are partitioned into subgraphs, code variants generated across resource levels, then evaluated in concurrent scenarios."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: University of Science and Technology of China and Independent Researchers."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It is a systems paper about compiler optimization."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable — no pre-trained model benchmark evaluation."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable — no pre-trained model benchmark evaluation."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Inference latency is extensively reported across resource configurations (Tables 3-4), and compilation time is quantified in hours (Figure 8)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Compilation times are reported in detail: e.g., ResNet-18 takes 129.3 hours with ConCo vs 301.3 hours with Ansor on 2080 Ti (Figure 8). Hardware is specified (2080 Ti, A100 80 GB)."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "ConCo improves inference throughput by up to 1.2x compared to existing solutions in concurrent DNN inference on shared GPUs.",
    286       "evidence": "Figures 9 and 10 show normalized throughput across 24 task configurations on 2080 Ti (up to 1.21x) and A100 (up to 1.18x) compared to Baseline 2.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "ConCo reduces job completion time by up to 69.85% for concurrent single-operator processes compared to Baseline 1.",
    291       "evidence": "Table 2 shows JCT reductions for operator pairs, with the maximum 69.85% reduction on A100 for ResNet-18 22nd OP vs BERT-small 6th OP.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "ConCo's optimal-code-sharing strategy achieves more than 2x compilation speedup over standard Ansor.",
    296       "evidence": "Figure 8 shows compilation times: e.g., ResNet-18 on 2080 Ti takes 129.3h (ConCo) vs 301.3h (Ansor). Section 4.2 states ConCo requires only 14.3%-32.3% of Ansor's time to achieve equivalent quality.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Code that performs optimally in exclusive GPU scenarios is suboptimal in concurrent scenarios.",
    301       "evidence": "Figure 2 demonstrates that code variants that are suboptimal under exclusive execution can reduce concurrent completion time by up to 26.2% across 5 operator pairs (Section 2.3).",
    302       "supported": "strong"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "ConCo is a concurrency-aware DNN compilation framework that generates multiple code variants optimized for different GPU resource constraints and selects optimal variants at runtime. The key insight is that code optimized for exclusive GPU use is suboptimal under concurrent execution due to resource contention. ConCo achieves up to 1.2x throughput improvement and 69.85% JCT reduction for concurrent inference, while reducing compilation time by over 2x through an optimal-code-sharing strategy that exploits performance correlations across resource configurations.",
    307   "red_flags": [
    308     {
    309       "flag": "No variance or error bars reported",
    310       "detail": "All results are point estimates with no indication of measurement variance across runs. For latency measurements on GPUs, variance can be significant due to thermal throttling, scheduling jitter, and other factors."
    311     },
    312     {
    313       "flag": "No limitations section",
    314       "detail": "The paper lacks any discussion of limitations or threats to validity. Key limitations (MPS-only, limited model diversity, no real production workload evaluation) are not acknowledged."
    315     },
    316     {
    317       "flag": "'Up to' framing",
    318       "detail": "Key results are reported as 'up to' maximums (69.85% JCT reduction, 1.2x throughput). The median or average improvements are not clearly stated, which could overstate typical gains."
    319     }
    320   ],
    321   "cited_papers": [
    322     {
    323       "title": "TVM: An automated End-to-End optimizing compiler for deep learning",
    324       "authors": ["Tianqi Chen", "Thierry Moreau", "Ziheng Jiang"],
    325       "year": 2018,
    326       "relevance": "Foundational DNN compiler framework that ConCo extends for concurrent execution scenarios."
    327     },
    328     {
    329       "title": "Ansor: Generating High-Performance tensor programs for deep learning",
    330       "authors": ["Lianmin Zheng", "Chengfan Jia", "Minmin Sun"],
    331       "year": 2020,
    332       "relevance": "The auto-scheduling system that ConCo builds upon and compares against as primary baseline."
    333     },
    334     {
    335       "title": "VELTAIR: towards high-performance multi-tenant deep learning services via adaptive compilation and scheduling",
    336       "authors": ["Zihan Liu", "Jingwen Leng", "Zhihui Zhang"],
    337       "year": 2022,
    338       "relevance": "Prior work on compilation optimization for concurrent DNN inference on CPUs; ConCo addresses why this approach fails on GPUs."
    339     },
    340     {
    341       "title": "ROLLER: Fast and efficient tensor compilation for deep learning",
    342       "authors": ["Hongyu Zhu", "Ruofan Wu", "Yijia Diao"],
    343       "year": 2022,
    344       "relevance": "DNN compiler for operator-level optimization; provides the benchmark operator collection used in ConCo's evaluation."
    345     },
    346     {
    347       "title": "A survey of multi-tenant deep learning inference on gpu",
    348       "authors": ["Fuxun Yu", "Di Wang", "Longfei Shangguan"],
    349       "year": 2022,
    350       "relevance": "Survey of GPU sharing for DNN inference, providing context for the multi-tenant inference problem ConCo addresses."
    351     }
    352   ]
    353 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs