scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20566B)
      1 {
      2   "paper": {
      3     "title": "CyberBOT: Towards Reliable Cybersecurity Education via Ontology-Grounded Retrieval Augmented Generation",
      4     "authors": ["Chengshuai Zhao", "Riccardo De Maria", "Tharindu Kumarage", "Kumar Satvik Chaudhary", "Garima Agrawal", "Yiwen Li", "Jongchan Park", "Yuli Deng", "Ying-Chih Chen", "Huan Liu"],
      5     "year": 2025,
      6     "venue": "Arizona State University"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": true,
     13         "justification": "Figure 4 shows a 'Quick Web App Setup' section with step-by-step deployment instructions including cloning a repository. The paper references a footnote for CyberBOT and shows deployment commands."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The evaluation uses CyberQ (Agrawal et al., 2024b), described as 'an open-source dataset comprising approx 3,500 open-ended cybersecurity QA pairs.' The course-specific knowledge base is not released."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper mentions Docker, A100 80GB GPU, Together AI API, Streamlit, SQLite, LangChain, and LlamaIndex, but no requirements.txt, Dockerfile contents, or detailed dependency versions are provided."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Figure 4 shows a quick setup guide screenshot but the actual commands and steps are not readable in the paper text. No detailed reproduction instructions for the experiments are provided."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Table 1 reports only point estimates (e.g., BERTScore 0.933) with no confidence intervals or error bars."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper makes comparative claims across question categories (ZS, FS, OD) but provides no statistical significance tests. The planned ANCOVA for the field study has not yet been conducted."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No effect sizes are reported. Results are presented as raw metric scores without baseline context or magnitude of differences."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The quasi-experimental study uses 77 students (39 experimental, 38 control) but no power analysis or justification for this sample size is provided."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No variance, standard deviation, or spread measures are reported for the computational evaluation results in Table 1."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No baseline comparisons are included. Table 1 reports only CyberBOT's own metrics without comparing to any alternative system, vanilla LLM, or prior QA approach."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No baselines are included at all, so contemporaneity cannot be assessed."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "CyberBOT has multiple components (intent interpreter, RAG retriever, ontology verifier) but no ablation study is conducted to measure their individual contributions."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper uses multiple metrics: BERTScore, METEOR, ROUGE-1, ROUGE-2 (QA-based) and Faithfulness, Answer Relevancy, Context Precision, Context Recall, Context Entity Recall (RAG-based)."
     78       },
     79       "human_evaluation": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "The paper describes a planned field study with surveys and interviews, but it only states that 'a forthcoming field study will evaluate its pedagogical impact'; no results are available yet. The system makes claims about educational trustworthiness that warrant human evaluation."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No mention of train/test/validation splits for the CyberQ evaluation dataset. It is unclear whether any tuning was done on the same data used for reporting."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Table 1 provides per-category breakdowns across Zero-shot (ZS), Few-shot (FS), and Ontology-Driven (OD) question types."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper discusses that 'the Faithfulness score is slightly lower because the model will leverage its own knowledge to generate answers when there is no closely relevant material in the knowledge base.' The limitations section also discusses failure modes."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "No negative results or failed approaches are reported. Every metric shows positive performance."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "The abstract claims CyberBOT 'reduc[es] the risk of misleading or unsafe guidance' but no comparison to a system without ontology validation is provided to support this claim. The abstract also claims 'Computational evaluations highlight the potential capacity', which Table 1 supports only weakly because it reports no baselines."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The paper claims the ontology verifier 'reduces the risk of misleading or unsafe guidance' and that the framework 'benefits from related documents as references,' but provides no ablation or controlled comparison to justify these causal claims."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The limitations section explicitly states 'the current deployment focuses on a single graduate-level course with a limited sample size; thus, findings may not generalize to diverse educational settings or other technical domains.'"
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "No alternative explanations for the results are discussed. The paper does not consider whether high BERTScore/ROUGE might be due to the QA pairs being derived from similar source material rather than system quality."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper specifies 'Llama 3.3 70B' for the intent interpreter, RAG generator, and ontology verifier, and 'BAAI-Bge-Large-1.5' for embeddings (Figure 3 and Section 3.2)."
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Appendix C provides the full prompts for the intent interpreter (C.1), the LLM generation (C.2), and the ontology verifier (C.3), including the actual template text and few-shot examples."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No temperature, top-p, max tokens, or other LLM hyperparameters are reported. The chunk size of 512 tokens for the knowledge base is mentioned but inference parameters are absent."
    142       },
    143       "scaffolding_described": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The three-stage pipeline (intent interpreter, RAG retriever with FAISS, ontology verifier) is described in detail in Sections 2.1-2.3 with workflow diagrams (Figures 1-2)."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper mentions PDFs are 'preprocess[ed] into smaller, semantically meaningful chunks' with 512 tokens, but does not describe how course materials were cleaned, filtered, or transformed beyond this brief mention."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "A dedicated 'Limitations' section is present, discussing four specific limitations including knowledge base coverage, limited deployment scope, ontology coverage gaps, and computational overhead."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "The limitations are specific to this study: 'the current deployment focuses on a single graduate-level course with a limited sample size' and 'the ontology-based validation primarily checks compliance with known concepts and relationships, leaving truly novel or emergent cybersecurity issues outside its purview.'"
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The paper explicitly states findings 'may not generalize to diverse educational settings or other technical domains' and identifies that novel cybersecurity issues are outside the system's purview."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No raw evaluation data, model outputs, or interaction logs are made available for verification."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The CyberQ dataset source is described (Section 4.2), and the quasi-experimental study design describes data collection via pre/post surveys, learning outcomes, and interviews (Section 4.3)."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 4.3 describes recruiting 77 graduate students from CSE 546 at ASU, with Monte Carlo random assignment stratified by gender into experimental (39) and control (38) groups."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The pipeline from CyberQ dataset to final metrics in Table 1 is not documented. It is unclear how the 3,500 QA pairs were processed, whether any were filtered, or how the evaluation was run."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding sources or acknowledgments section is present in the paper."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are clearly listed as Arizona State University (School of Computing and Mary Lou Fulton Teachers College)."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not absence of conflict."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial interests statement is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "The paper uses Llama 3.3 70B but does not state its training data cutoff date. The CyberQ dataset could potentially overlap with training data."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No discussion of whether CyberQ QA pairs or related cybersecurity content appeared in Llama 3.3's training data."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "CyberQ was published in 2024 and could be in Llama 3.3's training data. No contamination analysis is provided."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No pre-registration is mentioned for the quasi-experimental study with 77 students."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No IRB or ethics board approval is mentioned despite collecting data from human participants. The Ethical Considerations section discusses privacy and consent but does not mention IRB approval."
    242       },
    243       "demographics_reported": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "Beyond stating participants are 'computer science ASU graduate students' stratified by gender, no demographic details (age, experience level, nationality) are reported."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No inclusion or exclusion criteria are stated. The paper says 77 students but does not explain how they were selected from the course enrollment of 100+."
    252       },
    253       "randomization_described": {
    254         "applies": true,
    255         "answer": true,
    256         "justification": "Section 4.3 describes 'Monte Carlo random assignment (Metropolis and Ulam, 1949) stratified by gender' to assign 39 students to experimental and 38 to control groups."
    257       },
    258       "blinding_described": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No blinding is described. Students likely knew whether they had chatbot access or not, and no mention is made of evaluator blinding."
    262       },
    263       "attrition_reported": {
    264         "applies": true,
    265         "answer": false,
    266         "justification": "No attrition information is reported. The course has 100+ students but only 77 participated; this gap is unexplained."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "The system uses Together AI API for Llama 3.3 70B but no API costs, tokens consumed, or latency per query are reported."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "An A100 80GB is mentioned as the development server but no total compute budget, GPU hours, or API spending is stated."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "CyberBOT achieves an average BERTScore of 0.933 and Context Recall of 0.994 on the CyberQ dataset.",
    285       "evidence": "Table 1 reports these metrics across ZS, FS, and OD categories.",
    286       "supported": "moderate"
    287     },
    288     {
    289       "claim": "The ontology-based validation reduces the risk of misleading or unsafe guidance.",
    290       "evidence": "No ablation compares the system with and without ontology validation; the only support is the architecture description in Section 2.3.",
    291       "supported": "unsupported"
    292     },
    293     {
    294       "claim": "The framework produces higher scores in the Few-shot category than Zero-shot and Ontology-Driven categories.",
    295       "evidence": "Table 1 shows higher FS scores on most metrics (e.g., ROUGE-1: 0.788 FS vs 0.649 ZS).",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "CyberBOT has been deployed in a live classroom with more than 100 graduate students.",
    300       "evidence": "Section 4 describes deployment in CSE 546 at ASU for Spring 2025.",
    301       "supported": "moderate"
    302     }
    303   ],
    304   "methodology_tags": ["benchmark-eval", "case-study"],
    305   "key_findings": "CyberBOT is an ontology-grounded RAG system for cybersecurity education that combines an intent interpreter, retrieval-augmented generation, and ontology-based answer validation. Computational evaluation on the CyberQ dataset shows high BERTScore (0.933) and context recall (0.994), though no baselines are compared. A quasi-experimental study with 77 students is designed but results are not yet reported. The system is deployed in a live graduate course at ASU.",
    306   "red_flags": [
    307     {
    308       "flag": "No baselines",
    309       "detail": "The computational evaluation reports only CyberBOT's metrics without any baseline comparison (e.g., vanilla LLM, RAG without ontology). It is impossible to assess whether the ontology validation adds value."
    310     },
    311     {
    312       "flag": "No ablation study",
    313       "detail": "The system has three distinct components (intent interpreter, RAG, ontology verifier) but no ablation study measures their individual contributions."
    314     },
    315     {
    316       "flag": "Premature deployment claims",
    317       "detail": "The paper emphasizes real-world deployment and educational impact but the field study results are 'forthcoming.' The paper is essentially a system description with preliminary automated metrics."
    318     },
    319     {
    320       "flag": "Missing IRB approval",
    321       "detail": "A quasi-experimental study with 77 human participants collecting surveys and interview data is described without mentioning IRB or ethics board approval."
    322     },
    323     {
    324       "flag": "No uncertainty quantification",
    325       "detail": "All metrics in Table 1 are point estimates without error bars, confidence intervals, or variance across runs."
    326     }
    327   ],
    328   "cited_papers": [
    329     {
    330       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    331       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    332       "year": 2020,
    333       "relevance": "Foundational RAG paper relevant to understanding retrieval-augmented generation approaches in AI systems."
    334     },
    335     {
    336       "title": "RAGAS: Automated evaluation of retrieval augmented generation",
    337       "authors": ["Shahul Es", "Jithin James", "Luis Espinosa Anke", "Steven Schockaert"],
    338       "year": 2024,
    339       "relevance": "RAG evaluation framework used in this paper, relevant to benchmarking and evaluation methodology for RAG systems."
    340     },
    341     {
    342       "title": "Seven failure points when engineering a retrieval augmented generation system",
    343       "authors": ["Scott Barnett", "Stefanus Kurniawan", "Srikanth Thudumu"],
    344       "year": 2024,
    345       "relevance": "Analysis of RAG system failure modes relevant to software engineering quality of AI systems."
    346     },
    347     {
    348       "title": "Mindful-RAG: A study of points of failure in retrieval augmented generation",
    349       "authors": ["Garima Agrawal", "Tharindu Kumarage", "Zeyad Alghamdi", "Huan Liu"],
    350       "year": 2024,
    351       "arxiv_id": "2407.12216",
    352       "relevance": "Study of RAG failure points relevant to reliability and safety of LLM-based systems."
    353     },
    354     {
    355       "title": "Universal and transferable adversarial attacks on aligned language models",
    356       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini"],
    357       "year": 2023,
    358       "arxiv_id": "2307.15043",
    359       "relevance": "Adversarial attacks on LLMs relevant to AI safety and security research."
    360     },
    361     {
    362       "title": "The wolf within: Covert injection of malice into MLLM societies via an MLLM operative",
    363       "authors": ["Zhen Tan", "Chengshuai Zhao", "Raha Moraffah"],
    364       "year": 2024,
    365       "arxiv_id": "2402.14859",
    366       "relevance": "Adversarial attacks on multi-modal LLM systems relevant to AI safety."
    367     },
    368     {
    369       "title": "\"Glue pizza and eat rocks\" - Exploiting vulnerabilities in retrieval-augmented generative models",
    370       "authors": ["Zhen Tan", "Chengshuai Zhao", "Raha Moraffah"],
    371       "year": 2024,
    372       "arxiv_id": "2406.19417",
    373       "relevance": "RAG vulnerability exploitation relevant to security of retrieval-augmented AI systems."
    374     },
    375     {
    376       "title": "Ontology-aware RAG for improved question-answering in cybersecurity education",
    377       "authors": ["Chengshuai Zhao", "Garima Agrawal", "Tharindu Kumarage"],
    378       "year": 2024,
    379       "arxiv_id": "2412.14191",
    380       "relevance": "Predecessor to CyberBOT, directly relevant to ontology-grounded RAG evaluation methodology."
    381     }
    382   ]
    383 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs