scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20106B)
      1 {
      2   "paper": {
      3     "title": "Advancing engineering research through context-aware and knowledge graph–based retrieval-augmented generation",
      4     "authors": ["Soham Ghosh", "Gaurav Mittal"],
      5     "year": 2025,
      6     "venue": "Frontiers in Artificial Intelligence",
      7     "doi": "10.3389/frai.2025.1697169"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository URL provided in the Data Availability section: https://github.com/sghosh27/Low-Code-RAG-LLM-Framework-for-Context-Aware-Querying-in-Electrical-Standards-Design-and-Research along with Python code for document parsing and a scoring notebook."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The n8n workflow templates and Python notebooks are released via the GitHub repository. The evaluation queries and ground truth answers are published in Tables 2 and 5 of the paper."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned. The paper names tools (LM Studio, AnythingLLM, n8n, InfraNodus) but does not specify library versions or dependencies needed to recreate the environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While n8n workflow JSON files are released, there are no step-by-step reproduction instructions describing how to set up the full pipeline, configure the tools, or replicate the experiments end-to-end."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper reports median and interquartile range (IQR) for contextual vs traditional RAG: 0.83 [0.79, 0.89] and 0.62 [0.26, 0.77] respectively."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "A one-sided paired Wilcoxon signed-rank test was conducted to compare contextual RAG vs traditional RAG, reporting p < 0.0001 at significance level 0.05."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Median scores with baselines are reported: contextual RAG median 0.83 vs traditional RAG median 0.62, providing sufficient context to understand the magnitude of improvement."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is provided for the number of evaluation queries used. The sample sizes (6 queries in Table 2, 3 in Table 5, 2 in Table 6) are small with no discussion of whether this is sufficient."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "IQR is reported for contextual and traditional RAG scores: 0.83 [0.79, 0.89] and 0.62 [0.26, 0.77]."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Traditional RAG is used as a baseline against contextual RAG (Table 5). For reranking experiments, a no-reranking baseline is included alongside bge-reranker-base and Cohere V3.5 (Table 3)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include current tools: Cohere Rerank V3.5, bge-reranker-base, OpenAI embeddings, Voyage embeddings, Google-PaLM embeddings, which are contemporary."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is conducted to isolate which specific components of the contextual RAG pipeline contribute most to the improvement. The comparisons are between whole systems."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Hit rate and MRR are used for reranking evaluation (Table 3). GEval correctness scoring is used for response quality evaluation (Tables 2 and 5)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of system outputs is conducted. Evaluation relies entirely on automated GEval scoring against ground truth answers. Practitioner engineers helped design queries and ground truth but did not evaluate outputs."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "There is no mention of held-out test sets. The same queries appear to be used for both development and evaluation, with no separation described."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Individual query-level scores are reported in Tables 2 and 5, showing per-query performance for both traditional and contextual RAG."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4 discusses failures with numerical table retrieval, multi-condition exceptions, and chunking limitations. Q5 and Q6 in Table 2 show low scores (0.26, 0.20) with explanations of why the system failed."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that traditional RAG failed on table lookups (Q6 scored 0.20), knowledge graphs are poor for numerical tabular data, and both RAG approaches still hallucinate on complex multi-layered prompts."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims the frameworks improve relevance and promote deeper semantic understanding, which are supported by the experimental results in Tables 2, 3, and 5 showing improvements from reranking, contextual RAG, and knowledge graph RAG."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper claims contextual RAG 'improves' over traditional RAG, which is a causal claim. While a Wilcoxon test is used, the sample of queries is very small (3 paired comparisons in Table 5) and not randomly sampled, making the causal design weak."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims 'Advancing engineering research' broadly, but results are tested only on the 2017 NEC codebook and a small set of electrical engineering queries. No bounding of generalization to this specific domain is stated in the abstract or title."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are discussed for the observed improvements. For example, the improvement from contextual RAG could be due to larger chunk sizes rather than the contextual augmentation itself, but this is not considered."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper mentions 'Llama 4 Scout 17B 16E' for contextual RAG and lists models like 'Cohere Rerank v3.5' and 'bge-reranker-base', but does not specify exact versions or snapshot dates for the LLMs used in the main experiments (Tables 2 and 5). The specific model used for the traditional RAG experiments is not clearly stated."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Figure 5B shows the actual prompt template used for contextual RAG. The evaluation prompts are provided verbatim in Tables 2, 5, and 6. The InfraNodus system message is also quoted."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper mentions that temperature and top-K can be tuned in LM Studio but does not report the actual values used. Chunk size of 2,000 with overlap of 20 is stated for Table 5, but other hyperparameters (temperature, top-K, top-p) are not reported."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The n8n workflow architecture is described in detail with workflow diagrams (Figures 3, 4, 5, 7) showing document ingestion, embedding, vector storage, retrieval, reranking, and LLM generation steps."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper describes converting PDFs to Markdown, chunking strategies (chunk size 2,000, overlap 20), and the embedding process. The contextual RAG preprocessing with parent context augmentation is also described."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 (Summary and future work) discusses limitations including higher API costs for knowledge graph generation, limitations with numerical tabular data, and remaining hallucination issues. Limitations are also discussed throughout Sections 4 and 5."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity section exists. The limitations discussed are about the RAG systems' functional shortcomings, not about threats to the validity of the study's findings (e.g., small sample size, selection of queries, lack of generalizability)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to the NEC domain or acknowledge that results may not transfer to other engineering standards or document types."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The raw evaluation scores and individual query results are shown in tables, but the underlying NEC document used (proprietary NFPA content) is not available, and the full set of scoring outputs is not released as raw data files."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The paper describes that queries were developed collaboratively with practicing electrical engineering professionals, designed to cover NEC chapters, with ground truth developed by those practitioners."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants in the study. Practitioner engineers helped design queries but were not study subjects. The data source is a standard engineering codebook (NEC 2017)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from document upload through parsing, embedding, vector storage, retrieval, and LLM generation is documented across Sections 3-6 with workflow diagrams."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Funding section states: 'The author(s) declare that no financial support was received for the research and/or publication of this article.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations with Black & Veatch are disclosed in the header and in the Conflict of Interest section: 'SG and GM were employed at Black & Veatch.'"
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "The paper declares no financial support was received, making this criterion not applicable (unfunded work)."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The Conflict of Interest section discloses employment at Black & Veatch. The Author Disclaimer states the study was conducted independently without institutional or agency sponsorship."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses LLMs to answer questions about the NEC but does not state the training data cutoff for any model used. The LLMs may have seen NEC content during training, which would inflate RAG performance measurements."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the NEC 2017 content or similar engineering standards appeared in the LLMs' training data, which could confound the evaluation of RAG effectiveness."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The NEC 2017 is a widely available document that may have been in the training data of the models used. This contamination risk is not addressed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in the study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in the study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper discusses cost qualitatively (e.g., knowledge graph RAG uses 5-10x more tokens, Table 4 lists 'Embedding cost' as 'Low' vs 'Higher') but does not report actual inference costs, API spend, or per-query costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, GPU hours, or API spend is reported despite the paper using multiple commercial APIs (OpenAI, Cohere, Voyage)."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Contextual RAG significantly outperforms traditional RAG on NEC-based engineering queries",
    286       "evidence": "Wilcoxon signed-rank test p < 0.0001; median scores 0.83 vs 0.62 (Section 5.2, Table 5)",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "OpenAI + Cohere rerank combination achieves the highest retrieval performance",
    291       "evidence": "Table 3 shows OpenAI + Cohere V3.5 achieving hit rate 0.932 and MRR 0.877, highest across all configurations",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Knowledge graph-based RAG can traverse multiple linked knowledge graphs to answer cross-domain queries",
    296       "evidence": "Table 6 shows two qualitative examples where the AI agent consulted multiple 'brains' to compose answers",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Traditional RAG struggles with numerical table retrieval and multi-condition exceptions in engineering codes",
    301       "evidence": "Table 2 Q5 (score 0.26) and Q6 (score 0.20) demonstrate failures; qualitative analysis in Section 4",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval", "case-study"],
    306   "key_findings": "The paper presents rapidly deployable RAG-LLM pipelines using n8n for engineering document querying. Contextual RAG significantly outperforms traditional RAG on NEC-based queries (median 0.83 vs 0.62, p < 0.0001). Reranking with Cohere V3.5 consistently improves retrieval metrics across embedding models. Knowledge graph-based RAG enables cross-domain exploratory queries but at 5-10x higher API cost, and both traditional and contextual RAG still hallucinate on complex multi-layered prompts.",
    307   "red_flags": [
    308     {
    309       "flag": "Very small evaluation sample",
    310       "detail": "Only 6 queries for traditional RAG (Table 2), 3 paired comparisons for contextual vs traditional RAG (Table 5), and 2 queries for knowledge graph RAG (Table 6). Statistical claims from such small samples are fragile."
    311     },
    312     {
    313       "flag": "No contamination analysis",
    314       "detail": "The NEC 2017 is a widely available document. LLMs may have seen it during training, meaning the RAG system's apparent accuracy could be partly due to the model's pre-existing knowledge rather than retrieval quality."
    315     },
    316     {
    317       "flag": "Automated evaluation only",
    318       "detail": "All evaluation uses DeepEval's GEval metric, which itself relies on an LLM to score outputs. No human expert evaluated whether the RAG system's answers were correct for engineering decision-making."
    319     },
    320     {
    321       "flag": "Unspecified model versions",
    322       "detail": "The exact LLM models and versions used for the main experiments are not clearly stated, making reproduction difficult."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "HaluEval: a large-scale hallucination evaluation benchmark",
    328       "authors": ["J. Li", "X. Cheng", "W. X. Zhao", "J. Y. Nie", "J. R. Wen"],
    329       "year": 2023,
    330       "relevance": "Benchmark for evaluating LLM hallucination, relevant to methodology quality in LLM evaluation."
    331     },
    332     {
    333       "title": "Hallucination detection: robustly discerning reliable answers in large language models",
    334       "authors": ["Y. Chen", "Q. Fu", "Y. Yuan", "Z. Wen", "G. Fan", "D. Liu"],
    335       "year": 2023,
    336       "relevance": "Methods for detecting LLM hallucinations, relevant to LLM reliability and safety research."
    337     },
    338     {
    339       "title": "Retrieval Augmented Generation using engineering design knowledge",
    340       "authors": ["L. Siddharth", "J. Luo"],
    341       "year": 2024,
    342       "doi": "10.1016/j.knosys.2024.112410",
    343       "relevance": "RAG applied to engineering domains, directly relevant to evaluating RAG methodology quality."
    344     },
    345     {
    346       "title": "Enhancing engineering education through LLM-driven adaptive quiz generation: a RAG-based approach",
    347       "authors": ["S. Gopi", "D. Sreekanth", "N. Dehboz"],
    348       "year": 2024,
    349       "doi": "10.1109/FIE61694.2024.10893146",
    350       "relevance": "RAG application in engineering education, relevant to LLM application methodology."
    351     },
    352     {
    353       "title": "Large language models are highly vulnerable to adversarial hallucination attacks in clinical decision support: a multi-model assurance analysis",
    354       "authors": ["M. Omar", "V. Sorin", "J. D. Collins"],
    355       "year": 2025,
    356       "doi": "10.1101/2025.03.18.25324184",
    357       "relevance": "LLM vulnerability to adversarial attacks causing hallucinations, relevant to LLM safety."
    358     },
    359     {
    360       "title": "Enhancing uncertainty-based hallucination detection with stronger focus",
    361       "authors": ["T. Zhang", "L. Qiu", "Q. Guo"],
    362       "year": 2023,
    363       "relevance": "Hallucination detection methodology for LLMs, relevant to LLM reliability research."
    364     }
    365   ]
    366 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs