ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25547B)


      1 {
      2   "paper": {
      3     "title": "Semi-Supervised Cascaded Clustering for Classification of Noisy Label Data",
      4     "authors": [
      5       "Ashit Gupta",
      6       "Anirudh Deodhar",
      7       "Tathagata Mukherjee",
      8       "Venkataramana Runkana"
      9     ],
     10     "year": 2022,
     11     "venue": "arXiv preprint",
     12     "arxiv_id": "2205.02209",
     13     "doi": "10.48550/arXiv.2205.02209"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "The paper proposes SSCC, a semi-supervised cascaded clustering algorithm with a novel Cluster Evaluation Matrix (CEM) for classifying data with noisy labels. Tested on 4 small datasets (Coal, Ecoli, Wine, Eucalyptus) with 10-30% injected noise, SSCC generally maintains higher accuracy than SVM as noise increases. The approach was developed for coal type classification in thermal power plants. However, the evaluation uses only one baseline (SVM), reports only accuracy without statistical tests, and makes broad generalization claims from very small datasets.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No code repository, GitHub link, or any source code release is mentioned in the paper."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Three of four datasets are from public sources: Ecoli and Wine from UCI ML Repository [32], Eucalyptus from [35,36]. Coal dataset references a published handbook [31]. The paper states 'datasets available in the public domain' (Section 4)."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment specifications, library versions, programming language, or dependency information is provided anywhere in the paper."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No reproduction instructions, README, or runnable scripts are provided. The algorithms are described but exact implementation details for reproduction are missing."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Table 2 reports only point estimates of accuracy (e.g., '86.4%'). No confidence intervals, error bars, or ± notation appear anywhere in the paper."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims SSCC 'performed better in comparison with SVM' but no statistical significance tests (t-tests, p-values, etc.) are applied to any comparisons."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Table 2 shows raw accuracy percentages for each method but the paper never explicitly reports effect sizes, relative improvements, or Cohen's d. Differences must be computed by the reader from the table."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Dataset sizes range from 179 to 680 observations (Table 1). No justification is provided for why these sample sizes are adequate for the claims being made, nor is any power analysis discussed."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. Results appear to be from single runs with no indication of variability."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "SVM is used as a baseline comparison across all datasets and noise levels (Table 2)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The only baseline is SVM (reference [30] from 2001). The paper discusses DivideMix [24] and SemiNLL [25] (both 2020) as state-of-the-art noisy-label methods but does not compare against them, arguing they need large datasets. No other noisy-label-specific baselines are included."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No ablation study is performed. The paper does not systematically evaluate the contribution of individual components (CEM, completeness score threshold, cascading) by removing them. Testing k-means vs k-medoids is not an ablation of the algorithm's components."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Only classification accuracy (% accuracy) is reported in Table 2. No F1 score, precision, recall, AUC, or other metrics are used."
     89       },
     90       "human_evaluation": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Human evaluation is not relevant for this automated clustering/classification algorithm evaluation on tabular datasets."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper states 'Each dataset was divided into train (for SSCC 85%-90%) and test (for cascaded classifier 10%-15%) data. The test data for each dataset was free from noisy-labels and was kept constant across the variations.' (Section 4)"
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 2 provides per-dataset breakdowns across all four datasets and for each noise level (0%, 10%, 20%, 30%), showing how performance varies across conditions."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No qualitative error analysis or failure case examples are presented. Section 4.3 discusses general limitations but does not analyze specific instances where the algorithm failed."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper honestly reports that for the Wine dataset 'both the models did not show any drop in accuracy due to limited number of test observation and relatively shorter size of the dataset' (Section 4.2), acknowledging the experiment was uninformative for that dataset. Ecoli results show SSCC kmeans (77.3%) performing comparably to SVM (77.3%) at 0% noise."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims SSCC 'performed better in comparison with the support vector machines (SVM) when tested on multiple noisy-label datasets.' Table 2 generally supports this — SSCC accuracy is more stable than SVM as noise increases across Coal, Eucalyptus, and Ecoli datasets."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes causal claims like 'The algorithm reduces the dependency on expensive human expertise' and 'SSCC algorithm filtered out the noisy information through the CS and CEM criteria.' These mechanistic claims are not verified through controlled ablation, and no significance tests support the comparative claims."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper claims 'The proposed approach can be effectively used for deriving actionable insights in industrial settings with minimal human expertise' and 'it can be effectively used for any scenario in real world where classification is important' — extremely broad generalizations from 4 small datasets (179-680 observations)."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not discuss alternative explanations for why SSCC outperforms SVM. For instance, the noise injection method (random label flipping) may favor clustering-based approaches over discriminative ones, but this is not considered."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures classification accuracy and its claims are about classification accuracy, so there is no proxy gap. The measurements match the granularity of the claims."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No software library or implementation versions are specified for k-means, k-medoids, or SVM. The paper does not mention which ML library was used or what SVM kernel/parameters were employed."
    148       },
    149       "prompts_provided": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "The paper does not use any language model prompting. It uses classical ML algorithms (k-means, k-medoids, SVM)."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Table 3 in Appendix A provides SSCC threshold hyperparameters (λCEM, λCS, λOL) for all four datasets across both k-means and k-medoids variants. Default values are also stated in Section 4.1."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. This is a classical ML algorithm."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The paper documents normalization ('normalize the data using mean and standard deviation,' Algorithm-1), train/test split ratios (85-90%/10-15%), and noise injection procedure ('10% to 30% noisy-labels were introduced randomly,' Section 4)."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 4.3 'Possible Improvements' serves as a limitations section, discussing computational expense, feature selection limitations, and need for automated hyperparameter selection."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 4.3 identifies specific limitations: 'noisy-label datasets where distance-based clustering may not be applicable,' 'computationally expensive, especially for large number of features,' and 'Selection of hyperparameters for SSCC clustering varies with respect to data, domain, and desirable outcomes.'"
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "Despite specific limitations in Section 4.3, the paper does not explicitly bound its claims. It still asserts the algorithm 'can be effectively used for any scenario in real world where classification is important' without stating what the results do NOT show."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Three of four datasets are publicly available: Ecoli and Wine from UCI ML Repository [32], Eucalyptus from published sources [35,36]. Coal dataset references a published handbook [31]. The paper states these are 'datasets available in the public domain' (Section 4)."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Table 1 describes dataset characteristics (observations, labels, features). Source references are provided for all datasets. The noise injection process is described: 'A 10% to 30% noisy-labels were introduced randomly into the training data' (Section 4)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data comes from standard public datasets and a reference handbook."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline is documented: dataset selection → normalization → train/test split (85-90%/10-15%) → noise injection (10-30% random) → SSCC clustering on training data → classifier evaluation on clean test data (Section 4)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All four authors clearly list their affiliation as 'TCS Research, Tata Consultancy Services Ltd., Pune, Maharashtra, India' in the paper header."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "The work is from TCS Research and is 'developed as a part of a digital twin solution for a thermal power plant.' TCS has a commercial interest in demonstrating this algorithm works for their products, making the funder non-independent of the outcome."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement or financial interest disclosures appear in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This paper uses classical ML algorithms (k-means, k-medoids, SVM) trained from scratch on tabular datasets. No pre-trained model with a training data cutoff is evaluated."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No pre-trained model is evaluated on a benchmark. The models are trained from scratch with explicit train/test splits."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No pre-trained model is used, so benchmark contamination in the pre-training sense is not applicable."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study evaluates algorithms on tabular datasets."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants involved in the study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants involved in the study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants involved in the study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants involved in the study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants involved in the study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants involved in the study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference cost, latency, or runtime is reported. Section 4.3 mentions the algorithm is 'computationally expensive, especially for large number of features' but provides no quantitative cost data."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No computational budget, hardware specifications, or total compute time is stated anywhere in the paper."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of random seeds or sensitivity analysis across seeds. K-means has random initialization and noise injection is random, but neither is controlled for seed sensitivity."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is not stated. Results appear to be from single runs with no indication of averaging across multiple trials."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Section 4.1 states 'on the basis of experiments conducted, the default values of threshold parameters were found' but does not report how many configurations were tried, what search method was used, or the computational cost of the search."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Table 3 shows different hyperparameters per dataset but the selection process is not justified. The paper says values were 'found' through experiments without describing selection criteria or showing results for other configurations."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Multiple comparisons are made across 4 datasets, 4 noise levels, and 3 methods, but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors compare their own SSCC algorithm against their own implementation of SVM without acknowledging the bias of evaluating their own system or the risk of implementing baselines suboptimally."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of computational cost relative to performance. Section 4.3 acknowledges SSCC is 'computationally expensive' but does not quantify this or compare compute requirements with the SVM baseline."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper does not discuss whether accuracy on these small tabular datasets actually validates the approach for the claimed use case of industrial noisy-label classification at scale."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No agentic scaffolding is involved. This is a classical ML algorithm."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "Models are trained from scratch on the provided datasets. No pre-trained model with temporal training data is used, so temporal leakage is structurally inapplicable."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The paper correctly normalizes test data using training set statistics (Algorithm-2) but does not explicitly discuss feature leakage or whether the normalization procedure could introduce information leakage."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The paper randomly splits data into train/test but does not discuss whether examples may share structural dependencies (e.g., coal samples from the same source, Ecoli proteins from the same family)."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention methods are applied. The train/test split is described as random with no verification of independence."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "SSCC classifier performs better than SVM on noisy-label datasets",
    370       "evidence": "Table 2 shows SSCC maintaining higher or comparable accuracy to SVM as noise increases from 10-30% across Coal, Eucalyptus, and Ecoli datasets. For Coal at 30% noise: SSCC achieves 100% (kmeans) and 100% (kmedoids) train/validation vs SVM test 86.4%.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "SSCC eliminates noisy labels through CEM criteria and cascaded architecture",
    375       "evidence": "The CEM mechanism is described in Section 2 (Equation 4) and Algorithm-1 shows the outlier removal step (λOL threshold). However, no direct verification that noisy labels are correctly identified is provided.",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "SSCC is accurate and consistent despite variations in mislabeling",
    380       "evidence": "Table 2 shows SSCC kmeans maintaining 100% train accuracy across all noise levels for Coal data, and 99.1% for Eucalyptus. However, this applies to training/validation, not the independent test set. Wine dataset showed no discrimination between methods.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "The approach can be effectively used for any scenario in real world where classification is important and data has noisy labels",
    385       "evidence": "Tested on 4 small datasets (179-680 observations) with synthetically injected noise. No real-world deployment results or comparison with contemporary noisy-label methods (DivideMix, SemiNLL).",
    386       "supported": "weak"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Single weak baseline",
    392       "detail": "Only vanilla SVM is used as a baseline despite the paper citing DivideMix and SemiNLL as state-of-the-art noisy-label methods. The SVM reference is from 2001. No other noisy-label-specific baselines are included."
    393     },
    394     {
    395       "flag": "No error bars or statistical tests",
    396       "detail": "All results in Table 2 are point estimates with no confidence intervals, standard deviations, or statistical significance tests. K-means has random initialization and noise injection is random, making single-run results unreliable."
    397     },
    398     {
    399       "flag": "Suspiciously perfect training accuracy",
    400       "detail": "SSCC kmeans achieves 100% training accuracy on Coal data even with 30% noisy labels injected. While the algorithm removes noisy labels, the perfect score across all noise levels warrants scrutiny."
    401     },
    402     {
    403       "flag": "Overclaiming from small datasets",
    404       "detail": "The paper claims the approach 'can be effectively used for any scenario in real world' based on 4 small datasets (179-680 observations). This is a dramatic overgeneralization."
    405     },
    406     {
    407       "flag": "Single metric evaluation",
    408       "detail": "Only classification accuracy is reported. For datasets with class imbalance (Ecoli has 7 uneven classes), accuracy alone can be misleading. F1, precision/recall, or balanced accuracy would be more informative."
    409     },
    410     {
    411       "flag": "Corporate conflict of interest",
    412       "detail": "All authors are from TCS Research and the algorithm is 'developed as a part of a digital twin solution for a thermal power plant.' The company has a commercial interest in positive results. No competing interests statement is provided."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "DivideMix: Learning with noisy labels as semi-supervised learning",
    418       "authors": ["Junnan Li", "Richard Socher", "Steven CH Hoi"],
    419       "year": 2020,
    420       "arxiv_id": "2002.07394",
    421       "relevance": "State-of-the-art noisy-label learning method combining semi-supervised learning with noise-robust training, relevant to understanding robust ML training methodologies."
    422     },
    423     {
    424       "title": "SemiNLL: A framework of noisy-label learning by semi-supervised learning",
    425       "authors": ["Zhuowei Wang", "Jing Jiang", "Bo Han", "Lei Feng", "Bo An", "Gang Niu", "Guodong Long"],
    426       "year": 2020,
    427       "arxiv_id": "2012.00925",
    428       "relevance": "Framework for handling noisy labels via semi-supervised learning, relevant to understanding data quality challenges in ML training pipelines."
    429     },
    430     {
    431       "title": "Understanding deep learning (still) requires rethinking generalization",
    432       "authors": ["Chiyuan Zhang", "Samy Bengio", "Moritz Hardt", "Benjamin Recht", "Oriol Vinyals"],
    433       "year": 2021,
    434       "relevance": "Foundational work showing DNNs overfit noisy labels, relevant to understanding model robustness and generalization in AI systems."
    435     },
    436     {
    437       "title": "Auto-WEKA: Combined selection and hyperparameter optimization of classification algorithms",
    438       "authors": ["Chris Thornton", "Frank Hutter", "Holger H Hoos", "Kevin Leyton-Brown"],
    439       "year": 2013,
    440       "relevance": "Early AutoML work on automated model selection and hyperparameter optimization, relevant to the survey's coverage of automated ML pipelines."
    441     },
    442     {
    443       "title": "Towards automated semi-supervised learning",
    444       "authors": ["Yu-Feng Li", "Hai Wang", "Tong Wei", "Wei-Wei Tu"],
    445       "year": 2019,
    446       "relevance": "Work on automating semi-supervised classification, relevant to understanding the state of automated ML methodology."
    447     }
    448   ]
    449 }

Impressum · Datenschutz