scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (15124B)
      1 {
      2   "paper": {
      3     "title": "AI-Integrated Software Engineering: Developing Systems that Evolve with Learning Capabilities",
      4     "authors": ["Snigdha Gaddam"],
      5     "year": 2025,
      6     "venue": "Journal of Information Systems Engineering and Management"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No repository URL or code archive is provided. The paper includes inline Python code snippets but no link to a downloadable or runnable codebase."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "The data is randomly generated within the code snippets. No dataset is released or archived."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper mentions NumPy, pandas, scikit-learn, TensorFlow, Matplotlib, and seaborn but provides no version numbers, requirements file, or environment specification."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No step-by-step reproduction instructions are provided. Code snippets are embedded in the text but lack a complete runnable script or instructions."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Only point estimates are reported (97% accuracy). No confidence intervals or error bars."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No statistical significance tests are performed. The paper reports a single accuracy number with no comparative testing."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No effect sizes are reported. Only raw accuracy and feature importance values are presented."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The sample size (n_samples) is never stated explicitly in the paper text, and no justification or power analysis is provided."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Only single-run results are reported. No variance, standard deviation, or multiple-run results."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No baseline comparisons are included. Only the Random Forest model is evaluated with no comparison to other methods."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No baselines are included at all."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No ablation study is performed. The model uses four features but no experiments remove or vary components."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper reports accuracy, confusion matrix, classification report (precision, recall, F1), and feature importance values (Table 1, Sections V-VI)."
     78       },
     79       "human_evaluation": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "The paper trains a classifier on synthetic data. Human evaluation of outputs is not relevant to its claims."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The code shows an 80/20 train-test split using train_test_split with random_state=42, and test accuracy is reported separately."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The confusion matrix and classification report provide per-class breakdowns for the two decision outcome classes."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "The confusion matrix shows misclassifications exist but there is no discussion or analysis of what caused them."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Every result presented is positive. No failed approaches, alternative models, or negative findings are reported."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "The abstract claims the model demonstrates 'intelligent decision-making' but the experiment only classifies randomly generated binary labels from random features — this does not demonstrate intelligent decision-making."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The paper claims user behavior and performance metrics 'influence the outcome of decisions' but the data is entirely randomly generated with no causal structure. Any apparent feature importance is an artifact of random correlation."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The title and abstract claim relevance to 'AI-Integrated Software Engineering' and 'systems that evolve with learning capabilities' but the experiment is a Random Forest on random data with no connection to software engineering systems."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "No alternative explanations are discussed. The high accuracy on random data could be due to overfitting or random correlation, but this is not considered."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "No versions are specified for any libraries used (scikit-learn, NumPy, pandas, etc.)."
    132       },
    133       "prompts_provided": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "No LLM prompting is used. The paper trains a Random Forest classifier."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The code shows n_estimators=100 and random_state=42 for the Random Forest model, and test_size=0.2 for the train-test split."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No agentic scaffolding is used."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section IV describes preprocessing steps: null checking, StandardScaler normalization, and train-test splitting. Code is provided inline."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "A 'Research Limitations' subsection exists in Section VI acknowledging the use of random data and limited feature analysis."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "The limitations are generic: 'random data might not be a complete reflection of the complexities in the real world.' No specific threats to validity are identified."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No specific scope boundaries are stated. The paper does not clearly delineate what its results do and do not show beyond vague acknowledgment of synthetic data limitations."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No data is released. The data generation code is inline but no actual dataset is provided."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section IV describes the data generation process: random normal distributions for continuous features and random binary choice for the target variable."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants. Data is synthetically generated."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The full pipeline from generation through preprocessing to model training is documented via inline code and text descriptions in Sections IV-V."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding information or acknowledgments section is present."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The author is listed as 'Independent Researcher, USA' with an ORCID."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": false,
    206         "answer": false,
    207         "justification": "Author is listed as an independent researcher with no disclosed funding."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial interests statement is present."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": false,
    218         "answer": false,
    219         "justification": "No pre-trained model is evaluated on a benchmark. A Random Forest is trained from scratch on synthetic data."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "No pre-trained model is evaluated on a benchmark."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "No pre-trained model is evaluated on a benchmark."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No inference cost, latency, or computational cost is reported."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No computational budget or hardware details are stated."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "The Random Forest model achieved 97% accuracy in classifying decision outcomes.",
    285       "evidence": "Table 1 reports Model Accuracy of 97.00% and Training Accuracy of 97.50% (Section VI).",
    286       "supported": "weak"
    287     },
    288     {
    289       "claim": "User behavior and performance metric are the most important features influencing decision outcomes.",
    290       "evidence": "Feature importance chart (Fig 5) and Table 1 show feature importance values. However, user_behavior has importance 0.175, which is actually the lowest, contradicting the text claim (Section VI).",
    291       "supported": "unsupported"
    292     },
    293     {
    294       "claim": "AI-native systems can replicate intelligent decision-making through Random Forest classifiers.",
    295       "evidence": "The experiment trains a classifier on randomly generated data with no connection to real AI-native systems. The 97% accuracy on random data does not demonstrate intelligent decision-making.",
    296       "supported": "unsupported"
    297     }
    298   ],
    299   "methodology_tags": ["benchmark-eval"],
    300   "key_findings": "The paper trains a Random Forest classifier on synthetically generated random data with four continuous features and a binary target, achieving 97% accuracy. Feature importance analysis is presented but the text interpretation contradicts the numerical values in the table. The entire experiment operates on randomly generated data with no connection to real-world AI-native systems, making the findings uninformative about actual AI-integrated software engineering.",
    301   "red_flags": [
    302     {
    303       "flag": "Experiment on purely random data",
    304       "detail": "The entire experiment uses np.random.normal and np.random.choice to generate data. A Random Forest achieving 97% accuracy on classifying a randomly generated binary label from random features is either an artifact of overfitting or indicates the target variable was inadvertently correlated with features during generation. Either way, no real-world insight is produced."
    305     },
    306     {
    307       "flag": "Claims contradict reported numbers",
    308       "detail": "The text states 'user_behavior is the most important' feature, but Table 1 shows user_behavior has the lowest feature importance (0.175) while Sensor 1 has the highest (0.399). The abstract also names 'user behavior' as important."
    309     },
    310     {
    311       "flag": "Irrelevant references",
    312       "detail": "The 28 references cover topics like cryptocurrency, IoT, telehealth, and smart textiles. None are directly relevant to AI-native software engineering or Random Forest classification for decision-making systems."
    313     },
    314     {
    315       "flag": "No baselines or comparisons",
    316       "detail": "Only a single Random Forest model is tested. No comparison to other classifiers, naive baselines, or prior work."
    317     },
    318     {
    319       "flag": "Grandiose claims from trivial experiment",
    320       "detail": "The paper claims to demonstrate 'AI-Integrated Software Engineering' and 'systems that evolve with learning capabilities' based on a single Random Forest run on random data. The gap between claims and evidence is extreme."
    321     },
    322     {
    323       "flag": "Predatory journal indicators",
    324       "detail": "Published in JISEM, a journal with characteristics common to predatory publishers. The paper's quality — random data experiments presented as meaningful research, irrelevant references, internal contradictions — is consistent with minimal peer review."
    325     }
    326   ],
    327   "cited_papers": []
    328 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs