scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18884B)
      1 {
      2   "paper": {
      3     "title": "Autonomous Supplier Evaluation and Data Stewardship with AI: Building Transparent and Resilient Supply Chains",
      4     "authors": ["Chandra Bonthu", "Ganpati Goel"],
      5     "year": 2025,
      6     "venue": "International Journal of Computational and Experimental Science and Engineering (IJCESEN)",
      7     "doi": "10.22399/ijcesen.3854"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, code archive, or any mention of code release found in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper describes a manufacturing-level schema with multiple data sources but does not release any dataset or provide download links."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions 'pinned containers' and 'locked versions of an operating system, compiler, and CUDA/cuDNN' conceptually but does not provide actual version numbers, requirements files, or environment specifications."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper discusses metrics like PR-AUC, F1@k, NDCG, Brier score, and ECE but does not report any confidence intervals or error bars on these results."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims the full stack 'surpasses baselines' but provides no statistical significance tests. Comparisons are presented as qualitative findings in Table 2 without p-values or formal tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Table 2 describes findings qualitatively (e.g., 'Recall drops', 'Ranking degrades') without reporting actual numerical effect sizes, percentage improvements, or magnitude of differences."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper does not disclose the number of suppliers, purchase orders, or observations in the dataset, let alone justify the sample size."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance across runs, or spread measures are reported for any experimental results."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 5.2 states 'Baselines included a weighted scorecard, logistic regression on tabular features, and a gradient-boosted tree trained without text or graph input.'"
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The baselines are heuristic scorecards, logistic regression, and GBT without text/graph — these are reasonable but no contemporary ML-based supplier evaluation methods from the literature are compared against."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 2 presents ablations removing text features, graph features, comparing model families (tree ensembles vs tabular deep models), and comparing calibration methods (Platt vs isotonic)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses PR-AUC, F1@k, NDCG@5/10, Brier score, and ECE as evaluation metrics."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The paper describes human-in-the-loop triage and manual audit of text-dominated cases in Section 5.4, but these are part of the system design, not a systematic human evaluation of system outputs."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 5.1 describes a rolling-origin procedure with sequential training, validation, and test windows, and supplier blocking to ensure unseen suppliers in test folds."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The paper mentions delivery, quality, and compliance risk categories and multiple horizons (30/60/90 days), but Table 2 does not provide per-category numerical breakdowns."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5.4 discusses false positives (episodic expedite spikes not reflecting continuity risk) and false negatives (aggressive caches obscuring near-miss delays), with qualitative error analysis."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The calibration ablation reports that Platt scaling yields 'higher variance than isotonic', and the paper acknowledges tree ensembles outperform deep tabular models, suggesting the latter was a less successful approach."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The abstract claims 'Experiments show that it is better for early warning of risks' and 'calibrated ranking strategies are more effective than static thresholds,' but the results in Table 2 are entirely qualitative with no numerical values to verify these claims."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Ablation studies systematically remove individual modalities (text, graph) to measure their contribution, constituting controlled single-variable manipulation adequate for the causal claims made about component contributions."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 6.4 explicitly bounds generalizability: 'Category dynamics constrain generalizability: semiconductor lead-time shocks contrast with packaging disruptions.' The paper acknowledges results are contingent on governance quality and organizational maturity."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 6.4 discusses survivorship bias, coverage gaps at tier-n levels, labeling distortion from reporting delays, and measurement error in OCR-extracted certificates as alternative explanations for observed performance."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper refers to 'gradient-boosted tree', 'transformer-based models', 'tabular deep models' without specifying exact model names, versions, or library versions."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The paper does not use LLM prompting. It uses traditional ML models (gradient-boosted trees, transformer encoders for text embedding)."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Section 5.1 mentions 'Hyperparameters were identified on validation ranges via a multi-objective search' but does not report the actual hyperparameter values used."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The system is a traditional ML pipeline, not an LLM agent."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Sections 3.1 and 3.2 document data preprocessing in detail: entity resolution, FX normalization, unit dictionaries, UTC synchronization, working-day calendars, SLA harmonization, temporal truncation at label anchor, and data contract enforcement."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6.4 is titled 'Limitations & Threats to Validity' and provides substantive discussion."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 6.4 discusses specific threats: survivorship bias from attrited suppliers, tier-n coverage gaps undermining centrality features, labeling distortion from delayed shipment reports, OCR measurement error in certificates, and model-behavior drift from supplier adaptation to inspection routines."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6.4 states that gains are 'contingent upon the quality of governance, the maturity of data-observability, and the organization's ability to take action on recommendations.' Category-specific dynamics (semiconductor vs packaging) are noted as limiting generalizability."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data is released. The paper describes the schema (Table 1) but provides no access to the underlying procurement data."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.1 and Table 1 describe data sources in detail: purchase orders, goods receipt notes, ASN, accounts payable, QMS records, audit findings, TMS milestones, certificate OCR, and external risk feeds."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The data comes from enterprise procurement systems, not recruited subjects."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Sections 3.1-3.3 document the pipeline from raw data sources through entity resolution, normalization, feature engineering, labeling with temporal safeguards, and governance controls at each stage."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source or acknowledgments section found in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Chandra Bonthu at EVERSANA (Director MDM) and Ganpati Goel at Zero Motorcycles Inc."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The authors work at companies (EVERSANA, Zero Motorcycles) that could benefit from supplier evaluation systems, but this potential conflict is not discussed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interest declaration found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper uses traditional ML models (gradient-boosted trees, transformer encoders) trained on proprietary data, not pre-trained LLMs evaluated on benchmarks."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No pre-trained model is evaluated on a benchmark. The paper trains its own models on proprietary procurement data with temporal cross-validation."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained model benchmark evaluation. The paper uses internal procurement data, not public benchmarks."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in the research-study sense. The paper describes a system deployment, not a human subjects study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "Section 5.1 mentions 'inference latency' as part of multi-objective hyperparameter search and Section 5.3 mentions load testing inference delays, but no actual cost or latency numbers are reported."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No GPU hours, training time, or computational budget is reported despite describing containerized training with CUDA/cuDNN."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The full stack (tabular + text + graph) surpasses baselines for early risk detection in delivery, quality, and compliance.",
    286       "evidence": "Table 2 and Section 5.2 state the full stack surpasses baselines but provide only qualitative findings without numerical results.",
    287       "supported": "weak"
    288     },
    289     {
    290       "claim": "Removing text features (audit/narrative embeddings) causes recall to drop for compliance-related events.",
    291       "evidence": "Table 2 modality ablation and Section 5.2 describe the ablation qualitatively: 'Removal of unstructured features decreased recall at matched precision of compliance-related events.'",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "Removing graph features degrades ranking for thin-history suppliers.",
    296       "evidence": "Table 2 and Section 5.2: 'omitting the feature set associated with graphs, the poor rank statistics on the thin-history suppliers were not observed.' No numerical values provided.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Tree ensembles show stronger discrimination and resilience to missing data compared to tabular deep models.",
    301       "evidence": "Table 2 model family ablation, described qualitatively in Section 5.2 without specific metrics.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "Calibrated ranking strategies are more effective than static thresholds in limited review capacity settings.",
    306       "evidence": "Abstract and Section 5.2 mention decision-curve analysis but provide no numerical comparisons of capture rates or expected costs.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "Stewardship practices decrease override rates and improve adoption.",
    311       "evidence": "Section 5.5 states 'precipitous decreases in rates because calibration had stabilized' and Section 6.5 mentions 'lower override levels' but provides no specific numbers.",
    312       "supported": "weak"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval", "case-study"],
    316   "key_findings": "The paper proposes a multi-modal ML pipeline for supplier risk evaluation combining tabular, text (audit narratives), and graph (supply network) features with data governance controls. Ablation studies suggest each modality contributes unique signal, with text features improving compliance recall and graph features improving ranking for low-history suppliers. However, all experimental results are reported only qualitatively in Table 2 without any numerical values, confidence intervals, or statistical tests, making it impossible to assess the magnitude or reliability of claimed improvements.",
    317   "red_flags": [
    318     {
    319       "flag": "No numerical results",
    320       "detail": "Table 2 (the main results table) contains only qualitative descriptions of findings ('Recall drops', 'Ranking degrades', 'Tree ensembles show stronger discrimination') without a single numerical value. No PR-AUC scores, F1 values, NDCG numbers, or Brier scores are reported despite being listed as evaluation metrics."
    321     },
    322     {
    323       "flag": "Dataset size undisclosed",
    324       "detail": "The number of suppliers, purchase orders, events, or observations in the dataset is never stated. Without knowing the scale, it is impossible to assess whether the evaluation is meaningful."
    325     },
    326     {
    327       "flag": "No quantitative evidence for any claim",
    328       "detail": "Every empirical claim in the paper (abstract and Section 5.2) is unsupported by numerical evidence. The paper describes metrics and evaluation procedures in detail but never reports actual measured values."
    329     },
    330     {
    331       "flag": "Potential industry conflict not disclosed",
    332       "detail": "Authors work at EVERSANA (MDM/data management) and Zero Motorcycles (manufacturing). Both could benefit from supplier evaluation tools. No competing interests statement is provided."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "Attention Is All You Need",
    338       "authors": ["Vaswani, A."],
    339       "year": 2017,
    340       "relevance": "Foundational transformer architecture referenced for text encoding of audit narratives in the pipeline."
    341     },
    342     {
    343       "title": "A Dynamic Memory-Based Approach for Natural Language Inference",
    344       "authors": ["Raju"],
    345       "year": 2017,
    346       "relevance": "Referenced for dynamic memory inference approach used in the text encoder to reconcile contradictory audit notes."
    347     },
    348     {
    349       "title": "Dual Sourcing for Supply Chain Resilience",
    350       "authors": ["Goel", "Bhramhabhatt"],
    351       "year": 2024,
    352       "relevance": "Referenced for dual-sourcing strategies as risk mitigation in supply chains; co-authored by one of this paper's authors (Goel)."
    353     }
    354   ]
    355 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs