scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31873B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing Android Malware Detection with Retrieval-Augmented Generation",
      6     "authors": [
      7       "S. Saraga",
      8       "S. Anagha M.",
      9       "Dincy R. Arikkat",
     10       "A. Rafidha Rehiman K.",
     11       "S. Nicolazzo"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2506.22750",
     16     "doi": "10.48550/arXiv.2506.22750"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract claims 'improving detection accuracy over conventional feature-based methods for malware detection,' but the paper never compares against conventional feature-based methods. It only compares its own two description generation approaches. The contribution list claims 'outperforms state-of-the-art baselines' (Section 1) but no external SOTA baselines are tested.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Section 4.3.3 claims AgenticRAG's superiority is due to 'Enhanced Contextual Specificity,' 'Improved Edge Case Handling,' 'Semantic Coherence,' and 'Malware-Specific Terminology.' These are post-hoc explanations with no controlled experiments to isolate each factor. The paper uses causal language ('can be attributed to') without adequate causal design.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion claims the approach addresses 'increasingly complex challenges in security-critical applications' and mentions extension to 'network traffic analysis, vulnerability assessment, and threat intelligence' (Section 1). Results are on a single dataset from one source (AndroZoo) with one labeling scheme. The paper does not bound its generalization claims to the tested setting.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No alternative explanations are considered for why AgenticRAG outperforms Gemini Fusion. The difference could be due to RAG grounding, the number of LLM calls, prompt design differences, or random variation, but none are explored.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper uses VirusTotal with a threshold of ≥1 antivirus detection as ground truth for 'malicious.' This is a proxy—a single AV detection may be a false positive—but the paper does not acknowledge this gap. Additionally, static-analysis-based classification is framed as 'malware detection' without discussing the gap between classification of APK descriptions and real-world malware detection.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No dedicated limitations or threats-to-validity section exists. The conclusion (Section 5) mentions future work directions (dynamic analysis, multi-platform) which implicitly acknowledge limitations, but there is no substantive discussion of limitations.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed. The paper does not address potential issues such as the aggressive VirusTotal labeling threshold, dataset representativeness, temporal validity of malware labels, or the reliability of LLM-generated descriptions.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No explicit statements about what results do NOT show. The future work section mentions extending to dynamic analysis and other platforms, but does not explicitly bound the current claims to the tested setting.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source, grants, or acknowledgments section is present in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are clearly listed on the first page: Cochin University of Science and Technology, University of Milan, and University of Pavia. No commercial product affiliations are relevant.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Since funding is not disclosed, independence of funder from outcome cannot be determined.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is included in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "AgenticRAG is partially defined architecturally but 'agentic' is used loosely; the labeling criterion for 'malicious' (flagged by ≥1 VirusTotal engine) is mentioned once without discussion of its implications.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are bulleted in the introduction: dataset compilation, the AgenticRAG-based detection system, and experimental evaluation against an alternative approach.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The related work section traces the evolution from signature-based detection through CNNs, GNNs, and AppPoet, positioning this work relative to LLM-based approaches and identifying specific open challenges.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository URL or archive is provided anywhere in the paper. The system components (AgenticRAG pipeline, BERT classifiers) are described but no source code is released.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The raw data source (AndroZoo) is publicly available, but the authors' specific compiled dataset of 10,000 benign and 8,000 malicious APKs, their VirusTotal labels, and their generated functional descriptions are not released. The Data Corpus of Android feature descriptions is also not released.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No environment specifications, requirements files, hardware descriptions, or library versions are provided. The paper mentions using Androguard, FAISS, BM25, HuggingFace, and various LLMs but specifies no versions or dependencies.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No reproduction instructions, README, or scripts are provided. A researcher would have to reconstruct the entire pipeline from the prose description.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 5-8 are reported as point estimates only (e.g., '92.89% accuracy'). No confidence intervals, error bars, or uncertainty measures are reported.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims AgenticRAG outperforms Gemini Fusion and CySecBERT outperforms SecBERT based solely on comparing raw numbers (e.g., 92.89% vs 91.36%). No statistical significance tests are performed.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Tables 5-8 report absolute performance values for all compared systems (e.g., AgenticRAG 92.89% vs Gemini Fusion 91.36% accuracy), providing baseline context for interpreting differences across accuracy, precision, recall, and F1-score.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The dataset comprises 10,000 benign and 8,000 malicious samples, but no justification is given for why this size was chosen or whether it is sufficient for the claims made.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "All results appear to be single-run numbers. No standard deviations, variance across seeds, or multiple-run statistics are reported.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The paper compares AgenticRAG vs Gemini Fusion (Table 7), CySecBERT vs SecBERT (Tables 5-6), and three fusion models (Gemini, Llama, Mistral) in Table 8, providing internal baselines.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The paper cites AppPoet (2025) achieving 99.3% detection accuracy and HinDroid achieving 98.3%, but does not benchmark against any external prior methods on the same dataset. All comparisons are internal between the authors' own system variants.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "The system has multiple components (RAG retrieval, cache, fallback LLM querying, BERT classifier) but no ablation study isolates the contribution of individual components. The comparison between AgenticRAG and Gemini Fusion is between two different systems, not an ablation of one.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Four evaluation metrics are used throughout: accuracy, precision, recall, and F1-score (Section 4.2, Tables 5-8).",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation of the generated descriptions or classification outputs. All evaluation is automated via metrics on the test set.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Section 4.1 states 'the dataset is partitioned into training, validation, and testing subsets with a distribution ratio of 70:10:20' and 'comprehensive evaluation is performed on the segregated test set.'",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "Results are only reported as aggregate binary classification (benign vs malicious). No per-malware-family, per-category, or per-difficulty breakdown is provided. Confusion matrices show TP/FP/FN/TN but no finer granularity.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "No failure cases, error analysis, or qualitative examples of misclassified samples are discussed. The paper only reports aggregate metrics.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Table 8 shows that Llama fusion (87.97% accuracy) and Mistral fusion (88.89%) substantially underperform Gemini fusion (91.36%), and that SecBERT underperforms CySecBERT on key metrics. These are configurations that were tried and found inferior.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "The paper uses 'Gemini 2.0 Flash Lite,' 'LLaMA2,' and 'Mistral' without snapshot dates or API versions. CySecBERT and SecBERT are referenced by HuggingFace links (footnotes 5-6) but no specific model checkpoint versions. Marketing names without version snapshots do not count as specified.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Tables 2, 3, and 4 provide the actual prompt text used for AgenticRAG description generation, Llama/Mistral description generation, and Gemini fusion respectively, including the full templates with variable placeholders and response format instructions.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No LLM inference hyperparameters (temperature, top-p, max tokens) are reported. BERT fine-tuning hyperparameters (learning rate, batch size, epochs, optimizer) are not specified. Only 'early stopping mechanism implemented based on validation loss trajectory' is mentioned with no details.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Section 3.3 and Figure 2 describe the AgenticRAG architecture in detail: feature normalization, FAISS + BM25 ensemble retrieval with configurable weights, advanced fuzzy matching (Levenshtein distance, 65% threshold), cache memory, fallback LLM querying, and structured output generation pipeline.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 3.4 documents NLP preprocessing (text cleaning, lowercasing, stopword removal, Porter stemming). Section 3.2 describes static feature extraction categories (permissions, receivers, services, intent actions). Section 3.1 describes the VirusTotal labeling protocol.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "AndroZoo is publicly available, but the authors' specific selection of 10,000 benign and 8,000 malicious APKs, their VirusTotal labels, the generated functional descriptions, and the Data Corpus are not released. Results cannot be independently verified.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 3.1 describes data acquisition from AndroZoo using SHA256 hashes cross-referenced with VirusTotal, with a threshold-based labeling criterion (≥1 detection = malicious, 0 detections = benign).",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data comes from AndroZoo, a standard public APK repository.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "While the overall pipeline stages are described (AndroZoo → VirusTotal labeling → static analysis → description generation → classification), key details are missing: how the specific 10,000 benign and 8,000 malicious samples were selected from AndroZoo's millions of APKs, what time period the APKs cover, and what filtering criteria were applied.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "The paper does not evaluate a pre-trained model's zero-shot capability on an existing benchmark. It fine-tunes BERT variants on a custom dataset of LLM-generated descriptions. The LLMs are used as feature-engineering tools, not evaluated on a benchmark.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "Not applicable: the paper fine-tunes classifiers on a custom dataset of LLM-generated descriptions rather than evaluating a pre-trained model's knowledge on an established benchmark.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "Not applicable: no pre-trained model is evaluated on an established benchmark; the paper fine-tunes classifiers on a custom dataset.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or token consumption is reported. The paper even notes AppPoet's 5-second inference limitation (Section 2) but does not report its own system's cost or latency.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No GPU hours, training time, API costs, or hardware specifications are provided. The system uses multiple LLM API calls (Gemini, LLaMA, Mistral) and fine-tunes BERT models, but no computational budget is stated.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No mention of random seeds or results across multiple seeds. All results appear to be from single runs.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs is never stated. Results are presented as single values without indication of how many runs produced them.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search is described. The fuzzy matching threshold of 65% and the 70:10:20 split appear chosen without justification, and no search budget is reported.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "The paper compares multiple configurations (6 in Table 8) but does not explain how model selection was performed or whether validation performance guided the final configuration choice.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "Multiple comparisons are made across 6 model-classifier combinations (Table 8) plus additional comparisons in Tables 5-7, but no correction for multiple comparisons is applied.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "All baselines (Gemini Fusion, Llama Fusion, Mistral Fusion) are the authors' own implementations. No acknowledgment of self-evaluation bias, and no independent evaluation or external baselines.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "AgenticRAG involves retrieval (FAISS + BM25), cache lookups, and LLM calls, while Gemini Fusion uses multiple LLM calls. These likely have very different compute costs, but no comparison at matched compute budgets is provided.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "The paper does not discuss whether VirusTotal-based binary labeling (≥1 detection = malicious) actually measures malware detection capability. A single AV engine flagging an APK may be a false positive, and VirusTotal thresholds are known to be noisy.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": true,
    421           "answer": false,
    422           "justification": "AgenticRAG (retrieval + cache + LLM generation) vs Gemini Fusion (multi-LLM fusion) differ substantially in scaffolding. The paper attributes performance differences to the approach rather than the scaffolding confound.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "No discussion of the time period of the APK dataset or whether temporal ordering matters. The LLMs generating descriptions may have seen information about these APKs during pre-training.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the LLM-generated descriptions inadvertently encode label information or whether the prompts leak classification-relevant information beyond the static features.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether training and test APKs may share structural similarities (same developers, same malware families, near-duplicate apps). The 70:10:20 split appears random without stratification by app family.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No leakage detection or prevention method is used or discussed.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "AgenticRAG achieves 92.89% accuracy and 96.69% recall with CySecBERT on the test set",
    457       "evidence": "Tables 5 and 7 report these exact figures on the 20% held-out test set",
    458       "supported": "moderate"
    459     },
    460     {
    461       "claim": "AgenticRAG descriptions outperform Gemini Fusion descriptions for malware classification",
    462       "evidence": "Table 7 shows 92.89% vs 91.36% accuracy; however, no significance test is applied to a 1.5pp difference",
    463       "supported": "weak"
    464     },
    465     {
    466       "claim": "The proposed system outperforms state-of-the-art baselines in detection accuracy",
    467       "evidence": "No traditional ML or prior LLM-based malware detection baselines appear in the experiments; only the two proposed variants are compared",
    468       "supported": "unsupported"
    469     },
    470     {
    471       "claim": "RAG integration mitigates LLM hallucinations in the description generation process",
    472       "evidence": "The claim is asserted but no hallucination rate measurement or qualitative comparison of outputs is provided",
    473       "supported": "unsupported"
    474     },
    475     {
    476       "claim": "Gemini 2.0 Flash Lite is the best fusion model, outperforming Llama2 and Mistral",
    477       "evidence": "Table 8 shows Gemini Fusion achieving 91.36% accuracy vs Llama 87.97% and Mistral 88.89% with CySecBERT",
    478       "supported": "moderate"
    479     }
    480   ],
    481   "methodology_tags": [
    482     "benchmark-eval",
    483     "case-study"
    484   ],
    485   "key_findings": "The paper proposes AgenticRAG, a system combining RAG with an LLM (Gemini 2.0 Flash Lite) to generate functional text descriptions of Android APKs from static features, which are then classified by fine-tuned BERT variants. AgenticRAG descriptions achieve 92.89% accuracy and 96.69% recall with CySecBERT, outperforming a Gemini-fused LLaMA/Mistral baseline by ~1.5 percentage points in accuracy. However, the paper compares only LLM-based description generation variants against each other and not against any traditional ML or prior non-LLM baselines, making the claimed state-of-the-art improvement unverifiable. The entire evaluation rests on a single train/test split with no statistical significance testing.",
    486   "red_flags": [
    487     {
    488       "flag": "Missing baselines",
    489       "detail": "Abstract and introduction claim improvement over 'conventional feature-based methods' and 'state-of-the-art', but no traditional ML baselines (DREBIN, CNN, GNN) appear in any results table."
    490     },
    491     {
    492       "flag": "No statistical tests",
    493       "detail": "All comparative claims (AgenticRAG vs. Gemini Fusion, CySecBERT vs. SecBERT) are made without significance tests on differences as small as 1-2 percentage points from a single split."
    494     },
    495     {
    496       "flag": "Noisy ground truth",
    497       "detail": "Any single VirusTotal engine detection is used as the malicious label, which is known to produce high false-positive rates and is not discussed as a limitation."
    498     },
    499     {
    500       "flag": "Contamination unaddressed",
    501       "detail": "Gemini 2.0 Flash Lite generates descriptions from static features; its training cutoff and potential exposure to AndroZoo APK metadata or API patterns are never discussed."
    502     },
    503     {
    504       "flag": "No code or data release",
    505       "detail": "Neither the codebase nor the specific 18k-APK dataset subset is released, making independent reproduction impossible."
    506     },
    507     {
    508       "flag": "Unverified hallucination mitigation claim",
    509       "detail": "The abstract states RAG 'mitigates hallucinations' but no hallucination rate is measured or compared between RAG and non-RAG conditions."
    510     }
    511   ],
    512   "cited_papers": [
    513     {
    514       "title": "AppPoet: Large language model based android malware detection via multi-view prompt engineering",
    515       "relevance": "Direct prior work using LLMs for Android malware detection; primary baseline system discussed in related work"
    516     },
    517     {
    518       "title": "DREBIN: Effective and explainable detection of android malware in your pocket",
    519       "relevance": "Classic feature-based Android malware detection baseline that should have been included in experiments"
    520     },
    521     {
    522       "title": "HinDroid: An intelligent android malware detection system based on structured heterogeneous information network",
    523       "relevance": "Graph-based (heterogeneous information network) approach achieving 98.3% accuracy, cited in related work as state-of-the-art prior to the LLM era"
    524     },
    525     {
    526       "title": "CySecBERT: A domain-adapted language model for the cybersecurity domain",
    527       "relevance": "Primary classification model used in this paper; cybersecurity-specific BERT variant"
    528     },
    529     {
    530       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    531       "relevance": "Cited for position bias and verbosity effects in LLM evaluation, relevant to robustness of LLM-based approaches"
    532     },
    533     {
    534       "title": "FlowDroid: Precise context, flow, field, object-sensitive and lifecycle-aware taint analysis for Android apps",
    535       "relevance": "Foundational Android static analysis tool relevant to the feature extraction methodology"
    536     }
    537   ],
    538   "engagement_factors": {
    539     "practical_relevance": {
    540       "score": 2,
    541       "justification": "Android malware detection is practically relevant and the RAG-based approach could be applied by security practitioners, though no code or tool is released."
    542     },
    543     "surprise_contrarian": {
    544       "score": 0,
    545       "justification": "Confirms the expected result that RAG+LLM can improve classification; no conventional wisdom challenged."
    546     },
    547     "fear_safety": {
    548       "score": 1,
    549       "justification": "Addresses Android security but does not reveal novel attack vectors or raise new safety concerns."
    550     },
    551     "drama_conflict": {
    552       "score": 0,
    553       "justification": "No controversy, no critique of existing systems or companies."
    554     },
    555     "demo_ability": {
    556       "score": 0,
    557       "justification": "No code, demo, or installable tool is released."
    558     },
    559     "brand_recognition": {
    560       "score": 0,
    561       "justification": "From relatively unknown university labs; uses Gemini but is not from Google."
    562     }
    563   },
    564   "hn_data": {
    565     "threads": [
    566       {
    567         "hn_id": "44675438",
    568         "title": "A Photonic SRAM with Embedded XOR Logic for Ultra-Fast In-Memory Computing",
    569         "points": 57,
    570         "comments": 16,
    571         "url": "https://news.ycombinator.com/item?id=44675438"
    572       },
    573       {
    574         "hn_id": "43131809",
    575         "title": "Cache Is King: Smart Page Eviction with eBPF",
    576         "points": 8,
    577         "comments": 0,
    578         "url": "https://news.ycombinator.com/item?id=43131809"
    579       },
    580       {
    581         "hn_id": "43142367",
    582         "title": "Cache Is King: Smart Page Eviction with eBPF",
    583         "points": 5,
    584         "comments": 0,
    585         "url": "https://news.ycombinator.com/item?id=43142367"
    586       },
    587       {
    588         "hn_id": "42984804",
    589         "title": "Cache Is King: Smart Page Eviction with eBPF",
    590         "points": 2,
    591         "comments": 0,
    592         "url": "https://news.ycombinator.com/item?id=42984804"
    593       },
    594       {
    595         "hn_id": "45781713",
    596         "title": "Consequences of Undecidability in Physics on the Theory of Everything",
    597         "points": 1,
    598         "comments": 2,
    599         "url": "https://news.ycombinator.com/item?id=45781713"
    600       },
    601       {
    602         "hn_id": "44291959",
    603         "title": "Improving Brain-to-Image Reconstruction via Fine-Grained Text Bridging",
    604         "points": 1,
    605         "comments": 0,
    606         "url": "https://news.ycombinator.com/item?id=44291959"
    607       }
    608     ],
    609     "top_points": 57,
    610     "total_points": 74,
    611     "total_comments": 18
    612   }
    613 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs