scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29198B)
      1 {
      2   "paper": {
      3     "title": "Enhancing Android Malware Detection with Retrieval-Augmented Generation",
      4     "authors": [
      5       "Saraga S.",
      6       "Anagha M. S.",
      7       "Dincy R. Arikkat",
      8       "Rafidha Rehiman K. A.",
      9       "Serena Nicolazzo",
     10       "Antonino Nocera",
     11       "Vinod P."
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2506.22750",
     16     "doi": "10.48550/arXiv.2506.22750"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No code repository URL or archive is provided anywhere in the paper. The system components (AgenticRAG pipeline, BERT classifiers) are described but no source code is released."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The raw data source (AndroZoo) is publicly available, but the authors' specific compiled dataset of 10,000 benign and 8,000 malicious APKs, their VirusTotal labels, and their generated functional descriptions are not released. The Data Corpus of Android feature descriptions is also not released."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No environment specifications, requirements files, hardware descriptions, or library versions are provided. The paper mentions using Androguard, FAISS, BM25, HuggingFace, and various LLMs but specifies no versions or dependencies."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No reproduction instructions, README, or scripts are provided. A researcher would have to reconstruct the entire pipeline from the prose description."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 5-8 are reported as point estimates only (e.g., '92.89% accuracy'). No confidence intervals, error bars, or uncertainty measures are reported."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims AgenticRAG outperforms Gemini Fusion and CySecBERT outperforms SecBERT based solely on comparing raw numbers (e.g., 92.89% vs 91.36%). No statistical significance tests are performed."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Tables 5-8 report absolute performance values for all compared systems (e.g., AgenticRAG 92.89% vs Gemini Fusion 91.36% accuracy), providing baseline context for interpreting differences across accuracy, precision, recall, and F1-score."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The dataset comprises 10,000 benign and 8,000 malicious samples, but no justification is given for why this size was chosen or whether it is sufficient for the claims made."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "All results appear to be single-run numbers. No standard deviations, variance across seeds, or multiple-run statistics are reported."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares AgenticRAG vs Gemini Fusion (Table 7), CySecBERT vs SecBERT (Tables 5-6), and three fusion models (Gemini, Llama, Mistral) in Table 8, providing internal baselines."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The paper cites AppPoet (2025) achieving 99.3% detection accuracy and HinDroid achieving 98.3%, but does not benchmark against any external prior methods on the same dataset. All comparisons are internal between the authors' own system variants."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "The system has multiple components (RAG retrieval, cache, fallback LLM querying, BERT classifier) but no ablation study isolates the contribution of individual components. The comparison between AgenticRAG and Gemini Fusion is between two different systems, not an ablation of one."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Four evaluation metrics are used throughout: accuracy, precision, recall, and F1-score (Section 4.2, Tables 5-8)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of the generated descriptions or classification outputs. All evaluation is automated via metrics on the test set."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 4.1 states 'the dataset is partitioned into training, validation, and testing subsets with a distribution ratio of 70:10:20' and 'comprehensive evaluation is performed on the segregated test set.'"
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Results are only reported as aggregate binary classification (benign vs malicious). No per-malware-family, per-category, or per-difficulty breakdown is provided. Confusion matrices show TP/FP/FN/TN but no finer granularity."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No failure cases, error analysis, or qualitative examples of misclassified samples are discussed. The paper only reports aggregate metrics."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 8 shows that Llama fusion (87.97% accuracy) and Mistral fusion (88.89%) substantially underperform Gemini fusion (91.36%), and that SecBERT underperforms CySecBERT on key metrics. These are configurations that were tried and found inferior."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The abstract claims 'improving detection accuracy over conventional feature-based methods for malware detection,' but the paper never compares against conventional feature-based methods. It only compares its own two description generation approaches. The contribution list claims 'outperforms state-of-the-art baselines' (Section 1) but no external SOTA baselines are tested."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "Section 4.3.3 claims AgenticRAG's superiority is due to 'Enhanced Contextual Specificity,' 'Improved Edge Case Handling,' 'Semantic Coherence,' and 'Malware-Specific Terminology.' These are post-hoc explanations with no controlled experiments to isolate each factor. The paper uses causal language ('can be attributed to') without adequate causal design."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The conclusion claims the approach addresses 'increasingly complex challenges in security-critical applications' and mentions extension to 'network traffic analysis, vulnerability assessment, and threat intelligence' (Section 1). Results come from a single dataset from one source (AndroZoo) with one labeling scheme. The paper does not bound its generalization claims to the tested setting."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No alternative explanations are considered for why AgenticRAG outperforms Gemini Fusion. The difference could be due to RAG grounding, the number of LLM calls, prompt design differences, or random variation, but none are explored."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper uses VirusTotal with a threshold of ≥1 antivirus detection as ground truth for 'malicious.' This is a proxy—a single AV detection may be a false positive—but the paper does not acknowledge this gap. Additionally, static-analysis-based classification is framed as 'malware detection' without discussing the gap between classification of APK descriptions and real-world malware detection."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper uses 'Gemini 2.0 Flash Lite,' 'LLaMA2,' and 'Mistral' without snapshot dates or API versions. CySecBERT and SecBERT are referenced by HuggingFace links (footnotes 5-6) but without specific model checkpoint versions. Marketing names without version snapshots do not count as specified."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Tables 2, 3, and 4 provide the actual prompt text used for AgenticRAG description generation, Llama/Mistral description generation, and Gemini fusion respectively, including the full templates with variable placeholders and response format instructions."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "No LLM inference hyperparameters (temperature, top-p, max tokens) are reported. BERT fine-tuning hyperparameters (learning rate, batch size, epochs, optimizer) are not specified. Only 'early stopping mechanism implemented based on validation loss trajectory' is mentioned with no details."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.3 and Figure 2 describe the AgenticRAG architecture in detail: feature normalization, FAISS + BM25 ensemble retrieval with configurable weights, advanced fuzzy matching (Levenshtein distance, 65% threshold), cache memory, fallback LLM querying, and structured output generation pipeline."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 3.4 documents NLP preprocessing (text cleaning, lowercasing, stopword removal, Porter stemming). Section 3.2 describes static feature extraction categories (permissions, receivers, services, intent actions). Section 3.1 describes the VirusTotal labeling protocol."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No dedicated limitations or threats-to-validity section exists. The conclusion (Section 5) mentions future work directions (dynamic analysis, multi-platform) which implicitly acknowledge limitations, but there is no substantive discussion of limitations."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No threats to validity are discussed. The paper does not address potential issues such as the aggressive VirusTotal labeling threshold, dataset representativeness, temporal validity of malware labels, or the reliability of LLM-generated descriptions."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No explicit statements about what results do NOT show. The future work section mentions extending to dynamic analysis and other platforms, but does not explicitly bound the current claims to the tested setting."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "AndroZoo is publicly available, but the authors' specific selection of 10,000 benign and 8,000 malicious APKs, their VirusTotal labels, the generated functional descriptions, and the Data Corpus are not released. Results cannot be independently verified."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 3.1 describes data acquisition from AndroZoo using SHA256 hashes cross-referenced with VirusTotal, with a threshold-based labeling criterion (≥1 detection = malicious, 0 detections = benign)."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data comes from AndroZoo, a standard public APK repository."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "While the overall pipeline stages are described (AndroZoo → VirusTotal labeling → static analysis → description generation → classification), key details are missing: how the specific 10,000 benign and 8,000 malicious samples were selected from AndroZoo's millions of APKs, what time period the APKs cover, and what filtering criteria were applied."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source, grants, or acknowledgments section is present in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All author affiliations are clearly listed on the first page: Cochin University of Science and Technology, University of Milan, and University of Pavia. No commercial product affiliations are relevant."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "Since funding is not disclosed, independence of funder from outcome cannot be determined."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is included in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper does not evaluate a pre-trained model's zero-shot capability on an existing benchmark. It fine-tunes BERT variants on a custom dataset of LLM-generated descriptions. The LLMs are used as feature-engineering tools, not evaluated on a benchmark."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not applicable: the paper fine-tunes classifiers on a custom dataset rather than evaluating a pre-trained model's knowledge on an established benchmark."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Not applicable: the paper conducts no benchmark evaluation of a pre-trained model; it fine-tunes BERT variants on a custom dataset of LLM-generated descriptions."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost, latency, or token consumption is reported. The paper even notes AppPoet's 5-second inference limitation (Section 2) but does not report its own system's cost or latency."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No GPU hours, training time, API costs, or hardware specifications are provided. The system uses multiple LLM API calls (Gemini, LLaMA, Mistral) and fine-tunes BERT models, but no computational budget is stated."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of random seeds or results across multiple seeds. All results appear to be from single runs."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs is never stated. Results are presented as single values without indication of how many runs produced them."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search is described. The fuzzy matching threshold of 65% and the 70:10:20 split appear to have been chosen without justification, and no search budget is reported."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper compares multiple configurations (6 in Table 8) but does not explain how model selection was performed or whether validation performance guided the final configuration choice."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Multiple comparisons are made across 6 model-classifier combinations (Table 8) plus additional comparisons in Tables 5-7, but no correction for multiple comparisons is applied."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "All baselines (Gemini Fusion, Llama Fusion, Mistral Fusion) are the authors' own implementations. No acknowledgment of self-evaluation bias, and no independent evaluation or external baselines."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "AgenticRAG involves retrieval (FAISS + BM25), cache lookups, and LLM calls, while Gemini Fusion uses multiple LLM calls. These likely have very different compute costs, but no comparison at matched compute budgets is provided."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper does not discuss whether VirusTotal-based binary labeling (≥1 detection = malicious) actually measures malware detection capability. A single AV engine flagging an APK may be a false positive, and VirusTotal thresholds are known to be noisy."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "AgenticRAG (retrieval + cache + LLM generation) vs Gemini Fusion (multi-LLM fusion) differ substantially in scaffolding. The paper attributes performance differences to the approach rather than the scaffolding confound."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of the time period of the APK dataset or whether temporal ordering matters. The LLMs generating descriptions may have seen information about these APKs during pre-training."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the LLM-generated descriptions inadvertently encode label information or whether the prompts leak classification-relevant information beyond the static features."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether training and test APKs may share structural similarities (same developers, same malware families, near-duplicate apps). The 70:10:20 split appears random without stratification by app family."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention method is used or discussed."
    363       }
    364     }
    365   },
    366   "scan_version": 3,
    367   "active_modules": [
    368     "experimental_rigor",
    369     "data_leakage"
    370   ],
    371   "claims": [
    372     {
    373       "claim": "AgenticRAG-generated descriptions achieve 92.89% accuracy and 92.86% F1-score for malware classification using CySecBERT, outperforming Gemini Fusion (91.36% accuracy, 91.25% F1-score).",
    374       "evidence": "Tables 5 and 7 (Section 4.3) show direct comparison. AgenticRAG achieves higher recall (96.69% vs 90.50%) and F1-score (92.86% vs 91.25%), though precision is lower (88.40% vs 90.07%).",
    375       "supported": "weak"
    376     },
    377     {
    378       "claim": "CySecBERT outperforms SecBERT for malware classification due to its specialized cybersecurity pre-training.",
    379       "evidence": "Tables 5-6 show CySecBERT achieves higher recall (96.69% vs 93.00% for AgenticRAG; 90.50% vs 89.25% for Gemini Fusion) and higher F1-score in both settings. However, SecBERT achieves higher accuracy (93.31% vs 92.89%) in the AgenticRAG setting.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "Gemini 2.0 Flash Lite is the best fusion model, outperforming Llama2 and Mistral across all evaluation metrics.",
    384       "evidence": "Table 8 shows Gemini fusion at 91.36% accuracy vs Llama at 87.97% and Mistral at 88.89% when classified with CySecBERT. Consistent pattern with SecBERT.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "The proposed system outperforms state-of-the-art baselines in detection accuracy.",
    389       "evidence": "Section 1 contribution list claims this, but no external state-of-the-art methods are tested. The paper cites AppPoet at 99.3% accuracy (Section 2) which substantially exceeds their 92.89%.",
    390       "supported": "unsupported"
    391     },
    392     {
    393       "claim": "RAG integration mitigates LLM hallucinations in description generation.",
    394       "evidence": "The abstract states RAG enables 'the LLM to ground its output in relevant context' but no quantitative comparison of hallucination rates with vs without RAG is provided.",
    395       "supported": "unsupported"
    396     }
    397   ],
    398   "methodology_tags": [
    399     "benchmark-eval"
    400   ],
    401   "key_findings": "The paper proposes AgenticRAG for Android malware detection, combining FAISS+BM25 retrieval with Gemini 2.0 Flash Lite to generate functional descriptions of APKs from static features. These descriptions are classified by fine-tuned CySecBERT, achieving 92.89% accuracy on an 18,000-sample dataset from AndroZoo. AgenticRAG marginally outperforms a Gemini Fusion alternative (91.36%), and Gemini outperforms Llama2 and Mistral as fusion models. However, no external state-of-the-art baselines are tested, no statistical significance tests are performed, and the claimed advantages lack controlled experimental support.",
    402   "red_flags": [
    403     {
    404       "flag": "Unsupported SOTA claims",
    405       "detail": "The paper claims to 'outperform state-of-the-art baselines' (Section 1) but tests no external baselines. The cited AppPoet achieves 99.3% accuracy (Section 2), far exceeding the proposed system's 92.89%. All comparisons are between the authors' own system variants."
    406     },
    407     {
    408       "flag": "No statistical significance testing",
    409       "detail": "Performance differences between systems (e.g., 92.89% vs 91.36%) are small and presented without any significance tests, error bars, or multiple-run statistics. These differences could easily be within random variation."
    410     },
    411     {
    412       "flag": "Questionable ground truth labeling",
    413       "detail": "Applications with ≥1 VirusTotal detection (out of 60+ engines) are labeled malicious. A single antivirus flag is an extremely noisy signal—false-positive rates for individual AV engines are well documented. The paper does not discuss this."
    414     },
    415     {
    416       "flag": "No error analysis or failure cases",
    417       "detail": "The paper reports only aggregate metrics without examining what types of malware are missed, what benign apps are misclassified, or why. The 7.11% error rate is uncharacterized."
    418     },
    419     {
    420       "flag": "Overclaiming in abstract and conclusion",
    421       "detail": "The abstract claims 'improving detection accuracy over conventional feature-based methods' without testing any conventional methods. The conclusion extrapolates to 'network traffic analysis, vulnerability assessment, and threat intelligence' from a single Android classification experiment."
    422     },
    423     {
    424       "flag": "Single-run results",
    425       "detail": "All results appear to be from single experimental runs with no variance reporting. BERT fine-tuning is sensitive to random initialization and data ordering, making single-run comparisons unreliable."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "AppPoet: Large Language Model based Android Malware Detection via Multi-view Prompt Engineering",
    431       "authors": ["Wenxiang Zhao", "Juntao Wu", "Zhaoyi Meng"],
    432       "year": 2025,
    433       "relevance": "Directly relevant as LLM-based malware detection achieving 99.3% accuracy via multi-view prompt engineering, the key prior work in this space."
    434     },
    435     {
    436       "title": "CySecBERT: A Domain-Adapted Language Model for the Cybersecurity Domain",
    437       "authors": ["Markus Bayer", "Philipp Kuehn", "Ramin Shanehsaz", "Christian Reuter"],
    438       "year": 2024,
    439       "relevance": "Domain-adapted BERT model for cybersecurity used as the primary classifier; demonstrates effectiveness of domain-specific fine-tuning for security NLP tasks."
    440     },
    441     {
    442       "title": "SecBERT: Privacy-Preserving Pre-Training Based Neural Network Inference System",
    443       "authors": ["Hai Huang", "Yongjian Wang"],
    444       "year": 2024,
    445       "relevance": "Security-oriented BERT variant used as comparative classifier in the malware detection pipeline."
    446     },
    447     {
    448       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    449       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    450       "year": 2023,
    451       "relevance": "Documents position-dependent bias and verbosity effects in LLM-based evaluation, relevant to understanding LLM limitations in automated assessment."
    452     },
    453     {
    454       "title": "Summary of ChatGPT-Related Research and Perspective Towards the Future of Large Language Models",
    455       "authors": ["Yiheng Liu"],
    456       "year": 2023,
    457       "relevance": "Overview of LLM research trends including how LLMs are transforming various research approaches."
    458     },
    459     {
    460       "title": "Agentic Information Retrieval",
    461       "authors": ["Weinan Zhang", "Junwei Liao", "Ning Li"],
    462       "year": 2025,
    463       "relevance": "Foundational work on combining RAG with agentic behavior, the core architectural concept used in this paper's AgenticRAG system."
    464     },
    465     {
    466       "title": "Yes, Machine Learning Can Be More Secure! A Case Study on Android Malware Detection",
    467       "authors": ["Ambra Demontis", "Marco Melis", "Battista Biggio"],
    468       "year": 2017,
    469       "relevance": "Demonstrates ML-based Android malware detection with adversarial robustness considerations, relevant to understanding evasion resilience."
    470     },
    471     {
    472       "title": "Drebin: Effective and Explainable Detection of Android Malware in Your Pocket",
    473       "authors": ["Daniel Arp", "Michael Spreitzenbarth", "Malte Hubner"],
    474       "year": 2014,
    475       "relevance": "Seminal work on explainable Android malware detection using static features, foundational to the feature-based approach used in this paper."
    476     }
    477   ],
    478   "engagement_factors": {
    479     "practical_relevance": {
    480       "score": 2,
    481       "justification": "Android malware detection is practically relevant and the RAG-based approach could be applied by security practitioners, though no code or tool is released."
    482     },
    483     "surprise_contrarian": {
    484       "score": 0,
    485       "justification": "Confirms the expected result that RAG+LLM can improve classification; no conventional wisdom challenged."
    486     },
    487     "fear_safety": {
    488       "score": 1,
    489       "justification": "Addresses Android security but does not reveal novel attack vectors or raise new safety concerns."
    490     },
    491     "drama_conflict": {
    492       "score": 0,
    493       "justification": "No controversy, no critique of existing systems or companies."
    494     },
    495     "demo_ability": {
    496       "score": 0,
    497       "justification": "No code, demo, or installable tool is released."
    498     },
    499     "brand_recognition": {
    500       "score": 0,
    501       "justification": "From relatively unknown university labs; uses Gemini but is not from Google."
    502     }
    503   }
    504 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs