scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26335B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing Android Malware Detection with Retrieval-Augmented Generation",
      6     "authors": [
      7       "S. Saraga",
      8       "S. Anagha M.",
      9       "Dincy R. Arikkat",
     10       "A. Rafidha Rehiman K.",
     11       "S. Nicolazzo"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2506.22750",
     16     "doi": "10.48550/arXiv.2506.22750"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract claims improvement 'over conventional feature-based methods' but the experimental section only compares two description generation approaches (AgenticRAG vs Gemini Fusion); no direct comparison to raw feature-based classifiers is performed.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper attributes performance differences to specific AgenticRAG properties (enhanced contextual specificity, semantic coherence) but the comparison is not an ablation that isolates these factors; Gemini Fusion uses different underlying models, confounding the causal attribution.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion claims the approach addresses 'increasingly complex challenges in security-critical applications' broadly, but results are from a single AndroZoo snapshot with no discussion of temporal drift, malware family diversity, or applicability outside this dataset.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Performance differences between AgenticRAG and Gemini Fusion are attributed to AgenticRAG properties without considering alternatives such as differences in the underlying LLMs used for generation, dataset characteristics, or label noise from the VirusTotal threshold.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper uses accuracy/F1 on a held-out slice of the same AndroZoo distribution as a proxy for real-world malware detection capability, without distinguishing this from deployment performance against obfuscated, polymorphic, or zero-day variants.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion frames gaps only as 'future work directions' (adding dynamic analysis, multi-OS coverage) rather than honest acknowledgment of current limitations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed at all — not the VirusTotal labeling threshold, dataset temporal skew, class imbalance handling, or single-run evaluation without variance estimation.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper makes no explicit statement about what its results do not show; generalization to other malware families, time periods, or operating systems is left unaddressed.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source is mentioned anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are clearly listed on the title page (Cochin University of Science and Technology, University of Milan, University of Pavia).",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, making this criterion not assessable.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests or financial disclosure statement in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are adequately defined: 'AgenticRAG' is described in detail in Section 3.3, 'static analysis' is contrasted with dynamic analysis, and BERT variants are introduced with references.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 1 explicitly lists three contributions: dataset compilation with static analysis, the AgenticRAG-based detection system, and an extensive experimental evaluation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 traces the evolution from signature-based to deep learning approaches and explicitly positions the work relative to AppPoet (a direct predecessor using multi-view prompt engineering for Android malware detection).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository or link is provided; the paper describes the system architecture but makes no mention of a public release.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The paper uses a custom slice from AndroZoo but does not release the specific APK list, SHA256 hashes, extracted features, or generated descriptions used in experiments.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements file, Dockerfile, or dependency list is provided; specific library versions for HuggingFace, FAISS, BM25, Androguard, or the training framework are not stated.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided; the paper describes the pipeline conceptually but gives insufficient detail to reproduce experiments without guessing.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results are reported as single point estimates (e.g., accuracy 92.89%); no confidence intervals or error bars are provided for any metric.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any of the comparative results; differences between AgenticRAG and Gemini Fusion are presented without testing whether they exceed noise.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute performance differences are reported (e.g., recall 96.69% vs 90.50%), giving the reader enough context to assess practical magnitude even without formal effect-size measures.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The choice of 10,000 benign and 8,000 malicious samples is not justified; no power analysis or discussion of why this size is sufficient is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Results are from single training runs; no variance across seeds, folds, or repeated runs is reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "The only comparison is between two LLM-based description generation approaches; the abstract claims improvement over 'conventional feature-based methods' but no such baseline is included in the experiments.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Gemini 2.0 Flash Lite, LLaMA2, and Mistral are all contemporary models used as the comparative baselines within the paper's scope.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "There is no ablation study isolating individual components (e.g., the RAG retrieval vs. the LLM generation vs. the agentic planning); comparisons are between full systems, not components.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Four metrics are reported: accuracy, precision, recall, and F1-score, across all experimental conditions.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Human evaluation is not relevant for an automated malware classification task where ground truth labels come from VirusTotal.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The dataset is split 70:10:20 with a held-out test set, and results are reported on the test partition.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "No breakdown by malware family, permission category, or any other subgroup is provided; all results are aggregate binary classification metrics.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Failure cases are not discussed; confusion matrices are shown but no qualitative analysis of misclassified samples is provided.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "The paper does not report any negative results; the brief note that CySecBERT didn't beat SecBERT on every metric is presented as confirmation of CySecBERT's selection, not as a substantive negative finding.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Gemini 2.0 Flash Lite is named, but LLaMA2 and Mistral versions are not specified (no model size, quantization level, or snapshot date); CySecBERT and SecBERT have HuggingFace links but no version pinning.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full prompts for AgenticRAG (Table 2), LLaMA/Mistral generation (Table 3), and Gemini fusion (Table 4) are provided with template placeholders whose fill values are described.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No training hyperparameters are reported (learning rate, batch size, number of epochs, optimizer); only that early stopping on validation loss and class weighting are used.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The AgenticRAG pipeline is described in substantial detail: feature normalization, FAISS+BM25 ensemble retrieval, fuzzy matching with Levenshtein distance, fallback LLM querying, and cache memory are all explained.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 3.4 documents NLP preprocessing steps: text cleaning, lowercasing, stopword removal (with examples), and Porter stemming applied before classification.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Raw data (the specific APK subset, extracted features, or generated descriptions) is not released; only the source repository (AndroZoo) is referenced.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 3.1 describes the collection protocol: AndroZoo as source, SHA256 cross-referencing, VirusTotal API for labeling, and the threshold criterion (≥1 engine detection = malicious).",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; samples are drawn from a public APK repository.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "The pipeline from AndroZoo download to VirusTotal labeling to Androguard feature extraction is described, but the date range of APKs, deduplication procedure, and version-specific filtering are not documented.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The training data cutoff of Gemini 2.0 Flash Lite (used for description generation) is not stated; CySecBERT and SecBERT pre-training cutoffs are also not mentioned.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether the AndroZoo APKs used for evaluation may have been present in pre-training corpora for Gemini 2.0 Flash Lite or the BERT variants.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The dataset is constructed from AndroZoo without verifying whether those APKs or their metadata appear in the training data of any of the LLMs used.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "The paper notes in related work that prior LLM-based approaches exceed 5 seconds per application, but does not report inference latency or cost for the proposed AgenticRAG system itself.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No GPU hours, API call counts, or total computational budget for training or evaluation are reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "AgenticRAG achieves 92.89% accuracy and 96.69% recall, outperforming Gemini Fusion (91.36% accuracy, 90.50% recall) on malware classification.",
    375       "evidence": "Table 7 and Table 5 show direct comparisons of the two description generation approaches using CySecBERT classifier on the same held-out test set.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "CySecBERT outperforms SecBERT for malware detection tasks due to its cybersecurity-domain pre-training.",
    380       "evidence": "Tables 5 and 6 show CySecBERT with higher recall and F1 than SecBERT in both description settings, but the margin is small and no significance test is run.",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "Gemini 2.0 Flash Lite is the best fusion model, outperforming LLaMA2 and Mistral across all metrics.",
    385       "evidence": "Table 8 shows Gemini fusion achieving 91.36% accuracy vs 87.97% (LLaMA) and 88.89% (Mistral) with CySecBERT; result is consistent but based on a single run without variance.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "The proposed approach improves malware detection accuracy over conventional feature-based methods.",
    390       "evidence": "This claim is made in the abstract and conclusion but no direct comparison against feature-based baselines (e.g., Drebin, DroidEcho) is performed in the experimental section.",
    391       "supported": "unsupported"
    392     },
    393     {
    394       "claim": "RAG mitigates LLM hallucinations in generating application functional descriptions.",
    395       "evidence": "This is stated as a design motivation but no hallucination rate or factual accuracy metric is measured; the claim is qualitative and unverified.",
    396       "supported": "unsupported"
    397     }
    398   ],
    399   "methodology_tags": [
    400     "benchmark-eval"
    401   ],
    402   "key_findings": "The paper proposes an Android malware detection pipeline that uses AgenticRAG to convert static APK features into natural language descriptions, which are then classified by fine-tuned BERT variants. AgenticRAG-generated descriptions yield 92.89% accuracy and 96.69% recall versus 91.36% and 90.50% for a Gemini Fusion alternative on 18,000 APKs from AndroZoo. Gemini 2.0 Flash Lite outperforms LLaMA2 and Mistral as a fusion model. However, no direct comparison to non-LLM baselines is performed, no statistical significance is tested, and the claimed improvement over 'conventional feature-based methods' is not empirically validated.",
    403   "red_flags": [
    404     {
    405       "flag": "Missing baseline comparison",
    406       "detail": "Abstract and conclusion claim improvement over 'conventional feature-based methods' but no such baseline appears in the experimental section; the only comparison is between two LLM-based approaches."
    407     },
    408     {
    409       "flag": "No statistical tests or confidence intervals",
    410       "detail": "All comparative claims are based on single-run point estimates; differences of roughly 1.5-6pp between methods (e.g., 1.53pp accuracy and 6.19pp recall for AgenticRAG vs Gemini Fusion) are presented without significance testing, making it impossible to assess whether differences exceed noise."
    411     },
    412     {
    413       "flag": "No code or data release",
    414       "detail": "No artifacts are released: no code, feature extraction scripts, APK lists, or generated descriptions are made available, making independent reproduction impractical."
    415     },
    416     {
    417       "flag": "No limitations section",
    418       "detail": "There is no dedicated limitations or threats-to-validity section; gaps are framed exclusively as future work, not as limitations of the current results."
    419     },
    420     {
    421       "flag": "VirusTotal labeling threshold noise",
    422       "detail": "Any single antivirus engine flagging an APK labels it malicious; this one-engine threshold is maximally permissive (it favors catching malware over avoiding mislabeled benign apps), is known to generate false positives, and is not discussed as a threat to ground-truth validity."
    423     },
    424     {
    425       "flag": "Dataset temporality undisclosed",
    426       "detail": "The time range and collection date of the AndroZoo APKs are not reported, making it impossible to assess dataset freshness or whether train/test samples share malware family distributions."
    427     },
    428     {
    429       "flag": "Stopword removal and stemming applied to BERT input",
    430       "detail": "Section 3.4 applies stopword removal and Porter stemming before feeding text to BERT — preprocessing steps that are generally harmful for transformer models and may have suppressed performance."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "AppPoet: Large Language Model Based Android Malware Detection via Multi-View Prompt Engineering",
    436       "relevance": "Direct predecessor using LLMs for Android malware detection; the paper explicitly positions itself relative to AppPoet's multi-view prompt engineering approach."
    437     },
    438     {
    439       "title": "Drebin: Effective and Explainable Detection of Android Malware in Your Pocket",
    440       "relevance": "Classic feature-based Android malware detection baseline using static permissions and intents; foundational work in the area."
    441     },
    442     {
    443       "title": "HinDroid: An Intelligent Android Malware Detection System Based on Structured Heterogeneous Information Network",
    444       "relevance": "GNN-based approach achieving 98.3% detection accuracy; represents the state-of-the-art deep learning baseline discussed in related work."
    445     },
    446     {
    447       "title": "CySecBERT: A Domain-Adapted Language Model for the Cybersecurity Domain",
    448       "relevance": "The primary classification model used; cybersecurity-domain BERT variant central to the paper's methodology."
    449     },
    450     {
    451       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    452       "relevance": "Used to discuss position bias and verbosity effects in LLM-based systems; motivates design choices in evaluation methodology."
    453     },
    454     {
    455       "title": "DroidScope: Seamlessly Reconstructing the OS and Dalvik Semantic Views for Dynamic Android Malware Analysis",
    456       "relevance": "Represents the dynamic analysis line of work that the paper's static analysis approach is contrasted against."
    457     },
    458     {
    459       "title": "FlowDroid: Precise Context, Flow, Field, Object-Sensitive and Lifecycle-Aware Taint Analysis for Android Apps",
    460       "relevance": "Foundational static taint analysis work for Android security context."
    461     }
    462   ],
    463   "engagement_factors": {
    464     "practical_relevance": {
    465       "score": 2,
    466       "justification": "Android malware detection is a real and growing problem affecting billions of devices; the RAG-based approach is potentially deployable."
    467     },
    468     "surprise_contrarian": {
    469       "score": 1,
    470       "justification": "Using RAG to generate textual APK descriptions before classification is a novel framing but builds predictably on existing LLM-for-security trends."
    471     },
    472     "fear_safety": {
    473       "score": 2,
    474       "justification": "Android malware threatening user privacy and financial security is a genuine concern with clear user impact."
    475     },
    476     "drama_conflict": {
    477       "score": 0,
    478       "justification": "No controversy or conflict angle; straightforward system paper."
    479     },
    480     "demo_ability": {
    481       "score": 1,
    482       "justification": "AndroZoo is publicly accessible so the dataset can be obtained, but no code is released making replication difficult."
    483     },
    484     "brand_recognition": {
    485       "score": 0,
    486       "justification": "Authors are from Cochin University of Science and Technology and Italian universities; no famous lab or industrial affiliation."
    487     }
    488   },
    489   "hn_data": {
    490     "threads": [
    491       {
    492         "hn_id": "44675438",
    493         "title": "A Photonic SRAM with Embedded XOR Logic for Ultra-Fast In-Memory Computing",
    494         "points": 57,
    495         "comments": 16,
    496         "url": "https://news.ycombinator.com/item?id=44675438"
    497       },
    498       {
    499         "hn_id": "43131809",
    500         "title": "Cache Is King: Smart Page Eviction with eBPF",
    501         "points": 8,
    502         "comments": 0,
    503         "url": "https://news.ycombinator.com/item?id=43131809"
    504       },
    505       {
    506         "hn_id": "43142367",
    507         "title": "Cache Is King: Smart Page Eviction with eBPF",
    508         "points": 5,
    509         "comments": 0,
    510         "url": "https://news.ycombinator.com/item?id=43142367"
    511       },
    512       {
    513         "hn_id": "42984804",
    514         "title": "Cache Is King: Smart Page Eviction with eBPF",
    515         "points": 2,
    516         "comments": 0,
    517         "url": "https://news.ycombinator.com/item?id=42984804"
    518       },
    519       {
    520         "hn_id": "45781713",
    521         "title": "Consequences of Undecidability in Physics on the Theory of Everything",
    522         "points": 1,
    523         "comments": 2,
    524         "url": "https://news.ycombinator.com/item?id=45781713"
    525       },
    526       {
    527         "hn_id": "44291959",
    528         "title": "Improving Brain-to-Image Reconstruction via Fine-Grained Text Bridging",
    529         "points": 1,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=44291959"
    532       }
    533     ],
    534     "top_points": 57,
    535     "total_points": 74,
    536     "total_comments": 18
    537   }
    538 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs