scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21991B)
      1 {
      2   "paper": {
      3     "title": "The Role of Artificial Intelligence in Enhancing Operational Efficiency and Cost Optimization in Engineering-Driven Enterprises",
      4     "authors": ["Nandha Kumar B", "Balaji Jayakrishnan", "Toufik Mzili"],
      5     "year": 2025,
      6     "venue": "International Journal on Engineering Artificial Intelligence Management, Decision Support, and Policies",
      7     "doi": "10.63503/j.ijaimd.2025.169"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor"],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": false,
     16         "justification": "No code repository, GitHub link, or supplementary materials are mentioned anywhere in the paper. The MATLAB simulation code is not released."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "The paper uses synthetic data that is 'programmatically created and is embedded in the code' (Section 4), but neither the code nor the data is released."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper mentions MATLAB was used but provides no version number, toolbox requirements, or environment specifications."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No reproduction instructions, README, or step-by-step guide is provided. The algorithmic description in Section 4 is not sufficient to reproduce the exact results."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Table 2 reports mean, standard deviation, min, and max, but no confidence intervals are reported for the means. The main comparison figures (Figs. 4-5) are box plots that show the distributions but carry no error bars or CIs on the means."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Paired t-tests are reported with p-values < 0.001 for both cost and duration comparisons (Table 2)."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper reports percentage improvements with baseline context: 18.2% cost reduction (from $1,500,400 to $1,225,300) and 23.5% duration reduction (from 240 to 183 days), with full baseline numbers in Table 2."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The simulation uses 10 project configurations run 100 times each, but no justification is given for why these numbers were chosen. No power analysis is discussed."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Standard deviations are reported in Table 2 for both models across both metrics (e.g., Model A cost: std dev 115,400; Model B cost: std dev 180,200)."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper compares the GA (Model A) against a priority-based heuristic using Shortest Processing Time rule (Model B) as a baseline."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The baseline is a simple SPT heuristic, which is a basic scheduling rule. No contemporary metaheuristics (PSO, simulated annealing, ant colony) or other AI-based scheduling methods are compared, so the baseline is weak relative to current methods."
     71       },
     72       "ablation_study": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No ablation study is performed. The GA has multiple components (crossover, mutation, elitism, fitness weights) but none are ablated to determine their individual contributions."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The paper evaluates on multiple metrics: total project cost, total project duration (makespan), resource utilization, and computational time (Tables 2-3, Figs 4-5, 7)."
     81       },
     82       "human_evaluation": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "Human evaluation is not relevant for comparing optimization algorithm performance on synthetic scheduling problems."
     86       },
     87       "held_out_test_set": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "This is a simulation-based optimization study, not a machine learning evaluation with train/test splits. The concept of held-out test sets does not apply."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "Results are aggregated across all 10 project configurations. No per-project breakdown showing how the GA performs on individual projects with varying complexity is provided."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No failure cases or scenarios where the GA underperforms or struggles are discussed. All results show uniform improvement."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "No negative results are reported. The only acknowledged downside is computational cost (Table 3: 45.6s vs 0.8s), but this is dismissed rather than analyzed as a meaningful limitation."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The abstract claims 18.2% cost reduction and 23.5% time reduction, which match the results in Table 2. The claim of statistical significance is supported by p < 0.001."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "The paper claims the AI-based solution 'reduces project expenses by 18.2%' — this is causal language. While the comparison is controlled (same synthetic data), the synthetic nature means the causal claim only holds for the specific simulated scenarios, which is not acknowledged."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The title frames the scope broadly as 'Engineering-Driven Enterprises', and the conclusion states this is 'a significant step for businesses with an engineering focus.' However, the results come from a single synthetic dataset with programmatically generated parameters, with no real-world validation. The generalization far exceeds the evidence."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "No alternative explanations are discussed. The paper does not consider whether the GA's advantage is specific to the synthetic data structure, parameter ranges, or problem size. The possibility that the weak baseline inflates the improvement is not addressed."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper measures cost and duration on synthetic projects but frames results as evidence for 'operational efficiency' and 'cost optimization' in real engineering enterprises. The gap between simulated scheduling performance and real-world operational efficiency is not acknowledged."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": false,
    138         "answer": false,
    139         "justification": "This paper does not use any pre-trained AI/LLM models. The GA is a custom implementation, not a versioned model."
    140       },
    141       "prompts_provided": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "No prompting is used in this paper. The GA is a classical optimization algorithm, not an LLM-based approach."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "Key GA hyperparameters (population size, number of generations, crossover rate pc, mutation rate pm, elitism count, tournament size, fitness weights w1/w2) are referenced symbolically but their actual values used in the simulation are not reported."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. This is a classical GA implementation."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "The synthetic data generation process is described conceptually (tasks have IDs, durations, costs, resource requirements, dependencies) but the actual parameter ranges, distributions, and generation procedure are not documented in sufficient detail for reproduction."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "There is no dedicated limitations section. A single sentence at the end of Section 5 acknowledges the synthetic dataset and deterministic model, but this is not a substantive limitations discussion."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "The only limitation mentioned is that 'it uses the deterministic simulation model and a synthetic dataset' and that 'Real projects face great uncertainty.' This is a generic observation, not a specific threat analysis."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No specific scope boundaries are stated. The paper does not explicitly state what the results do NOT show or what settings are excluded from the claims."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No raw simulation data is available. Neither the synthetic project configurations nor the per-run results are released."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The synthetic data generation is described at a high level (tasks with properties) but the actual parameter values, distributions, and project structures used are not specified."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants; data is synthetically generated. Recruitment methods do not apply."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The pipeline from data generation to results is not documented in detail. The paper does not explain how the 10 project configurations were generated, how the 100 runs per configuration were aggregated, or which specific parameters were varied."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The paper explicitly states 'Funding source: None.'"
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly listed: VIT Chennai Business School and Chouaib Doukkali University."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": false,
    214         "answer": false,
    215         "justification": "The paper states it is unfunded, so funder independence is not applicable."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "The paper includes a conflict of interest statement: 'The authors declare no potential conflict of interest in this publication.'"
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "This paper does not evaluate a pre-trained model on any benchmark. The GA is a classical optimization algorithm run on synthetic data."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "No pre-trained model is evaluated on benchmarks. Train/test overlap is not applicable."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No pre-trained model is evaluated on benchmarks. Contamination is not applicable."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants in this study."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants in this study."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "Table 3 reports computational time: GA averages 45.6 seconds, heuristic 0.8 seconds. This serves as a practical cost measure."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No total computational budget is stated. The hardware used for simulation is not specified, and total simulation time across all configurations and runs is not reported."
    287       }
    288     },
    289     "experimental_rigor": {
    290       "seed_sensitivity_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "While the GA is run 100 times per configuration, no explicit seed sensitivity analysis is reported. The variance in Table 2 may partially reflect seed sensitivity but this is not discussed."
    294       },
    295       "number_of_runs_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Section 5 states '10 distinct, synthetically generated project configurations, with each configuration run 100 times.'"
    299       },
    300       "hyperparameter_search_budget": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No hyperparameter search budget is reported. The GA hyperparameters (population size, crossover/mutation rates, etc.) appear chosen but without reporting how they were selected or how many configurations were tried."
    304       },
    305       "best_config_selection_justified": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "No explanation of how the GA configuration was selected. The hyperparameter values are not even reported, let alone justified."
    309       },
    310       "multiple_comparison_correction": {
    311         "applies": false,
    312         "answer": false,
    313         "justification": "Only two main comparisons are made (cost and duration), so multiple comparison correction is not strictly necessary."
    314       },
    315       "self_comparison_bias_addressed": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The authors implement both the GA and the baseline heuristic, and the baseline is a simple SPT rule. The paper does not acknowledge the bias inherent in evaluating the authors' own system against a weak baseline they also implemented."
    319       },
    320       "compute_budget_vs_performance": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The GA takes ~57x longer to run than the heuristic (45.6s vs 0.8s), but performance is not analyzed as a function of compute. No comparison is made at matched compute budgets."
    324       },
    325       "benchmark_construct_validity": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The synthetic benchmark's validity as a proxy for real engineering project scheduling is not discussed. Whether the programmatically generated task structures reflect real-world project complexity is not analyzed."
    329       },
    330       "scaffold_confound_addressed": {
    331         "applies": false,
    332         "answer": false,
    333         "justification": "No scaffolding is involved. This is a direct comparison of two scheduling algorithms."
    334       }
    335     }
    336   },
    337   "claims": [
    338     {
    339       "claim": "The AI-based GA reduces project expenses by 18.2% compared to the conventional heuristic.",
    340       "evidence": "Table 2: Model A mean cost $1,225,300 vs Model B mean cost $1,500,400, p < 0.001.",
    341       "supported": "moderate"
    342     },
    343     {
    344       "claim": "The AI-based GA reduces project duration by 23.5% compared to the conventional heuristic.",
    345       "evidence": "Table 2: Model A mean duration 183 days vs Model B mean duration 240 days, p < 0.001.",
    346       "supported": "moderate"
    347     },
    348     {
    349       "claim": "The AI model achieves higher and more balanced resource utilization than the heuristic.",
    350       "evidence": "Fig. 7 shows comparative resource utilization bar chart, but no numerical values are provided in the text.",
    351       "supported": "weak"
    352     },
    353     {
    354       "claim": "The GA's advantage grows in more resource-constrained scenarios.",
    355       "evidence": "Fig. 9 (labeled as sensitivity analysis) shows percentage improvement increasing as resource capacity decreases, but no numerical results are reported in the text.",
    356       "supported": "weak"
    357     }
    358   ],
    359   "methodology_tags": ["benchmark-eval"],
    360   "key_findings": "The paper proposes using a Genetic Algorithm for multi-objective project scheduling optimization, comparing it against a Shortest Processing Time heuristic on synthetic data. The GA achieves 18.2% cost reduction and 23.5% duration reduction with statistical significance (p < 0.001) across 10 project configurations run 100 times each. However, results are entirely based on synthetic data with no real-world validation, and the baseline is a simple SPT heuristic rather than a competitive alternative.",
    361   "red_flags": [
    362     {
    363       "flag": "Weak baseline inflates improvements",
    364       "detail": "The only baseline is a simple SPT heuristic. No comparison against other metaheuristics (PSO, simulated annealing, ant colony optimization) that are standard in RCPSP literature. Comparing a population-based search against a simple greedy rule is not a meaningful test."
    365     },
    366     {
    367       "flag": "Entirely synthetic evaluation",
    368       "detail": "All results are on programmatically generated synthetic data with no real-world validation. The paper makes broad claims about 'engineering-driven enterprises' based solely on simulated toy problems."
    369     },
    370     {
    371       "flag": "Unreported hyperparameters",
    372       "detail": "Critical GA hyperparameters (population size, crossover rate, mutation rate, number of generations, fitness weights, tournament size) are described symbolically but their actual values are never reported, making reproduction impossible."
    373     },
    374     {
    375       "flag": "Irrelevant references",
    376       "detail": "Multiple references are unrelated to the paper's topic. Ref [5] is about Android app security, Ref [9] is about cloud file sharing security, Ref [10] is about ride-sharing pricing. This suggests poor scholarship or padding."
    377     },
    378     {
    379       "flag": "Overclaiming from simulation",
    380       "detail": "The conclusion states this is 'a significant step for businesses with an engineering focus' and a 'strategic shift' — claims far exceeding what a synthetic simulation study can support."
    381     },
    382     {
    383       "flag": "Figure numbering error",
    384       "detail": "Fig. 9 appears twice — once as a Gantt chart (Heuristic schedule) and once as a sensitivity analysis plot. This suggests careless preparation."
    385     }
    386   ],
    387   "cited_papers": [
    388     {
    389       "title": "AI Agents in Engineering Design: A Multi-Agent Framework for Aesthetic and Aerodynamic Car Design",
    390       "authors": ["M. Elrefaie", "J. Qian", "R. Wu", "Q. Chen", "A. Dai", "F. Ahmed"],
    391       "year": 2025,
    392       "arxiv_id": "2503.23315",
    393       "relevance": "Multi-agent AI system for engineering design — relevant to agentic AI applications."
    394     },
    395     {
    396       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    397       "authors": ["J. Becker", "N. Rush", "E. Barnes", "D. Rein"],
    398       "year": 2025,
    399       "arxiv_id": "2507.09089",
    400       "relevance": "Empirical measurement of AI impact on developer productivity — core survey topic."
    401     },
    402     {
    403       "title": "On the Opportunities and Risks of Foundation Models",
    404       "authors": ["R. Bommasani"],
    405       "year": 2021,
    406       "arxiv_id": "2108.07258",
    407       "relevance": "Foundational survey on foundation model risks and opportunities."
    408     },
    409     {
    410       "title": "A Survey of Large Language Models",
    411       "authors": ["W. X. Zhao"],
    412       "year": 2023,
    413       "arxiv_id": "2303.18223",
    414       "relevance": "Comprehensive LLM survey relevant to the survey scope."
    415     },
    416     {
    417       "title": "XGBoost: A Scalable Tree Boosting System",
    418       "authors": ["T. Chen", "C. Guestrin"],
    419       "year": 2016,
    420       "doi": "10.1145/2939672.2939785",
    421       "relevance": "Foundational ML method used in AI-based cost estimation systems discussed in the literature review."
    422     }
    423   ]
    424 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs