scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23857B)
      1 {
      2   "paper": {
      3     "title": "TelecomRAG: Taming Telecom Standards with Retrieval Augmented Generation and LLMs",
      4     "authors": [
      5       "Girma M. Yilma",
      6       "Jose A. Ayala-Romero",
      7       "Andres Garcia-Saavedra",
      8       "Xavier Costa-Perez"
      9     ],
     10     "year": 2024,
     11     "venue": "Computer Communication Review",
     12     "arxiv_id": "2406.07053",
     13     "doi": "10.1145/3711992.3711996"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No repository URL, code archive, or download link is provided anywhere in the paper. The implementation is described using Langchain, Chroma, and OpenAI APIs but no source code is released."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The underlying 3GPP specification documents are publicly available, but the authors' processed knowledge base (vector database, chunked documents, embeddings) is not released. The evaluation questions are also not released."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper mentions specific libraries (Langchain, Chroma, Gradio) and models (gpt-4-1106-preview, text-embedding-ada-002) but provides no requirements.txt, Dockerfile, or library versions. Not enough detail to recreate the environment."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions are provided. The implementation details in Section IV-A describe the architecture but do not provide runnable commands or scripts."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No quantitative results are reported at all. The evaluation is purely qualitative, consisting of a single example comparison. No confidence intervals or error bars exist."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No statistical significance tests are used. The paper claims TelecomRAG 'surpasses' generic LLMs based on qualitative inspection of outputs, not any statistical comparison."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No effect sizes or quantitative performance differences are reported. The evaluation provides no numerical metrics of any kind."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Section IV-B mentions 'a vast pool of technical questions' and 'a large set of technical questions' but never quantifies N. Only one example query is shown, and no justification for the sample size is given."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No variance, standard deviation, or any spread measure is reported. The paper presents no quantitative results across runs or queries."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Section IV-B compares TelecomRAG against ChatGPT-4, Gemini Ultra, and TelecomGPT (a third-party telecom-specialized ChatGPT application). Table III shows their responses to the same query."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "ChatGPT-4 and Gemini Ultra were state-of-the-art general-purpose LLMs at the time of writing (2024). TelecomGPT is a domain-specialized application. These are reasonable contemporary baselines."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "The system has multiple components (query condenser, retriever, verification & optimization module, history module) but no ablation study examines the contribution of each component."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No quantitative metrics are used at all. The evaluation relies entirely on qualitative comparison of a single example output. Terms like 'accuracy', 'technical depth', and 'verifiability' are used descriptively but never measured."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No formal human evaluation is conducted. The authors informally judge the outputs, but there is no structured evaluation protocol, no use of multiple evaluators, and no measure of inter-rater agreement."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No formal test set is defined. The paper mentions evaluating on 'a large set of technical questions' but does not describe any data split or held-out test set."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No per-category or per-topic breakdown is provided. A single example query about ECN Failure Indication is shown with no analysis across question types or difficulty levels."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No failure cases are discussed. The paper only presents a successful example, with no analysis of where TelecomRAG might fail, produce hallucinations, or retrieve irrelevant documents."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "No negative results are reported. Every aspect of TelecomRAG is presented positively with no discussion of limitations in system performance."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The abstract claims TelecomRAG 'surpasses generic LLMs, offering superior accuracy, technical depth, and verifiability.' This is supported by a single qualitative example only. The claim of consistent superiority is not substantiated by the evidence presented."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper implicitly claims that RAG causes better answers for telecom queries ('RAG offers a way to create precise, fact-based answers'). The single uncontrolled example comparison does not support causal inference — differences could stem from prompt engineering, model choice, or the specific query selected."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The abstract and conclusion claim broad superiority ('surpasses generic LLMs', 'significant value to the telecommunications field') based on one example query about ECN Failure Indication from 3GPP TS 23.334. No scope boundaries are stated regarding query types, specification complexity, or document coverage."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "No alternative explanations are discussed. The observed difference could be partly due to the example being particularly suited to RAG (a specific lookup query), GPT-4's training data cutoff missing relevant 3GPP specs, or the specific prompt engineering used."
    132       },
    133       "proxy_outcome_distinction": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper claims 'accuracy', 'technical depth', and 'verifiability' but does not define how these are measured. A single author-judged example comparison is used as a proxy for systematic evaluation of these qualities, with no acknowledgment of this gap."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section IV-A specifies 'gpt-4-1106-preview' for the LLM and 'text-embedding-ada-002' for embeddings. These are specific versioned model identifiers."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Section III-B provides one example prompt fragment: 'Assume you are a 3GPP standard expert and need to provide a very comprehensive answer to a non-experienced trainee.' However, the full system prompts for query condensation and answer elaboration are not provided. The reader cannot reconstruct the actual prompts sent to the model."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Section IV-A reports K=4 retrieved documents, chunk size of 4000 characters, and 100-character overlap. However, critical LLM inference parameters (temperature, top-p, max tokens) are not reported."
    154       },
    155       "scaffolding_described": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section III details the full pipeline architecture: standalone query generation via LLM condenser, semantic retrieval with HNSW, history module for conversation context, and verification & optimization module. Figure 1 provides an architecture diagram."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section IV-A documents the preprocessing: files in various formats (PDF, TXT, DOCX, DOC) processed via DirectoryLoader, split into 4000-character chunks with 100-character overlap using CharacterTextSplitter, then embedded with text-embedding-ada-002."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "There is no limitations section or threats-to-validity section. The conclusion (Section V) mentions plans to extend the knowledge base but does not discuss any limitations of the current work."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No threats to validity are discussed anywhere in the paper."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No explicit scope boundaries are stated. The paper does not discuss what types of queries may not work well, which specifications are not covered, or what the system cannot do."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The underlying 3GPP specs are public, but the processed vector database, evaluation questions, and system outputs are not available for verification."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section IV-A states the knowledge base was built from 'the complete set of 3GPP release-16 and release-18 standard specification documents' in various formats (PDF, TXT, DOCX, DOC)."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No human participants. The data source is public 3GPP specification documents."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "The knowledge base creation pipeline is documented (loading → chunking → embedding → vector DB). However, the evaluation data pipeline is entirely undocumented — how the 'vast pool of technical questions' was created, selected, or structured is never described."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding information, acknowledgments section, or grant numbers are provided anywhere in the paper."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Author biographies at the end identify all four authors as affiliated with NEC Laboratories Europe. Author affiliations are clearly stated."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "All authors work at NEC Laboratories Europe and are evaluating a system they built. NEC has a commercial interest in telecom AI solutions. No funding disclosure means independence cannot be assessed."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests statement or financial interests declaration is provided. NEC Labs employees evaluating their own telecom AI system creates an obvious potential conflict that is not disclosed."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "The paper evaluates a RAG-based tool/system rather than testing a pre-trained model's inherent knowledge on a benchmark. The evaluation concerns system-level output quality, not model memorization."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "The paper tests a tool (TelecomRAG) rather than model knowledge directly. Contamination in the benchmark sense does not apply."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No standard benchmark is used. The evaluation is a qualitative comparison of system outputs on ad-hoc telecom questions."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in the study."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in the study."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in the study."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No inference cost, latency, or token consumption is reported. The system uses GPT-4 API calls for both query condensation and answer generation, plus embedding API calls, but costs are never quantified."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No computational budget is stated for building the vector database (embedding all 3GPP specs) or for running inference. Hardware specifications are not mentioned."
    291       }
    292     }
    293   },
    294   "scan_version": 3,
    295   "active_modules": [],
    296   "claims": [
    297     {
    298       "claim": "TelecomRAG surpasses generic LLMs, offering superior accuracy, technical depth, and verifiability for telecom standards queries.",
    299       "evidence": "Section IV-B shows one example query about ECN Failure Indication where TelecomRAG produces a correct, referenced response (Table II) while ChatGPT-4, Gemini Ultra, and TelecomGPT produce vague or incorrect responses (Table III). The paper states 'our solution maintained this depth and accuracy across all queries in our evaluation' without showing the data.",
    300       "supported": "weak"
    301     },
    302     {
    303       "claim": "TelecomRAG provides verifiable responses by referencing specific 3GPP documents.",
    304       "evidence": "Table II shows the system's response includes a reference to '3GPP Technical Specification 23.334', which matches the ground truth document cited in Table I.",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "Generic LLMs (ChatGPT-4, Gemini Ultra, TelecomGPT) provide vague, inaccurate, and not verifiable responses for telecom standards queries.",
    309       "evidence": "Table III shows responses from all three systems for one query about ECN Failure Indication. ChatGPT-4 discusses general IP ECN mechanisms rather than the 3GPP-specific procedure. Gemini Ultra states it cannot find the term. TelecomGPT acknowledges the term is not in its documents.",
    310       "supported": "weak"
    311     },
    312     {
    313       "claim": "RAG provides more factual, specific, and diverse responses than language models alone for knowledge-intensive tasks.",
    314       "evidence": "This claim in Section II-C cites Lewis et al. [11] rather than presenting original evidence. The paper's own evaluation (one example) is too limited to independently support this.",
    315       "supported": "weak"
    316     }
    317   ],
    318   "methodology_tags": [
    319     "case-study"
    320   ],
    321   "key_findings": "TelecomRAG is a RAG-based assistant for telecom standards built on 3GPP Release 16/18 documents, using GPT-4 with Langchain, Chroma vector store, and HNSW search. A qualitative comparison on a single example query shows it produces a correct, referenced answer about ECN Failure Indication while ChatGPT-4, Gemini Ultra, and TelecomGPT fail to identify the relevant 3GPP specification. No quantitative evaluation metrics are reported despite claims of evaluation on 'a vast pool' of questions.",
    322   "red_flags": [
    323     {
    324       "flag": "Single example evaluation",
    325       "detail": "The entire evaluation section shows only one example query. The paper claims evaluation on 'a vast pool of technical questions' and 'a large set of technical questions' but never quantifies the evaluation set or shows aggregate results. All claims of superiority rest on a single, possibly cherry-picked example."
    326     },
    327     {
    328       "flag": "No quantitative metrics",
    329       "detail": "Despite claiming accuracy and superiority, the paper reports zero quantitative metrics — no accuracy rates, no precision/recall, no user satisfaction scores, no automated evaluation metrics. The assessment is entirely qualitative and informal."
    330     },
    331     {
    332       "flag": "Authors evaluating own system",
    333       "detail": "All four authors are from NEC Laboratories Europe and built TelecomRAG. The evaluation is conducted by the same team with no independent evaluators, no blinding, and no structured evaluation protocol. NEC has commercial interests in telecom AI solutions."
    334     },
    335     {
    336       "flag": "Claims significantly outrun evidence",
    337       "detail": "The abstract claims the system 'surpasses generic LLMs, offering superior accuracy, technical depth, and verifiability' and Section IV-B states it 'consistently provided thorough, accurate, and technically detailed responses.' These broad claims are supported by a single example comparison."
    338     },
    339     {
    340       "flag": "Potentially cherry-picked example",
    341       "detail": "The chosen example query ('What are the information elements included in the ECN Failure Indication?') is a direct factual lookup ideally suited to RAG retrieval. No examples test reasoning across multiple specifications, ambiguous queries, or queries where the knowledge base might be incomplete."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Language models are few-shot learners",
    347       "authors": ["T. Brown", "B. Mann", "N. Ryder"],
    348       "year": 2020,
    349       "relevance": "Foundational GPT-3 paper introducing in-context learning capabilities of large language models."
    350     },
    351     {
    352       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    353       "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
    354       "year": 2022,
    355       "relevance": "Introduces chain-of-thought prompting technique for improving LLM reasoning on complex tasks."
    356     },
    357     {
    358       "title": "Large language models encode clinical knowledge",
    359       "authors": ["K. Singhal", "S. Azizi", "T. Tu"],
    360       "year": 2022,
    361       "arxiv_id": "2212.13138",
    362       "relevance": "Med-PaLM demonstrates domain-specific LLM adaptation for healthcare, relevant to evaluating domain adaptation approaches."
    363     },
    364     {
    365       "title": "BloombergGPT: A large language model for finance",
    366       "authors": ["S. Wu", "O. Irsoy", "S. Lu"],
    367       "year": 2023,
    368       "arxiv_id": "2303.17564",
    369       "relevance": "Domain-specific LLM trained on financial data, relevant as a comparison approach to domain adaptation via fine-tuning vs RAG."
    370     },
    371     {
    372       "title": "Galactica: A large language model for science",
    373       "authors": ["R. Taylor", "M. Kardas", "G. Cucurull"],
    374       "year": 2022,
    375       "arxiv_id": "2211.09085",
    376       "relevance": "Scientific domain LLM that still exhibited hallucination despite high-quality training data, motivating RAG approaches."
    377     },
    378     {
    379       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    380       "authors": ["P. Lewis", "E. Perez", "A. Piktus"],
    381       "year": 2020,
    382       "relevance": "Foundational RAG paper proposing retrieval-augmented generation methodology that TelecomRAG builds upon."
    383     },
    384     {
    385       "title": "REALM: Retrieval-Augmented Language Model Pre-Training",
    386       "authors": ["K. Guu", "K. Lee", "Z. Tung"],
    387       "year": 2020,
    388       "relevance": "Early retrieval-augmented language model combining non-parametric knowledge with pre-trained models."
    389     },
    390     {
    391       "title": "Dense passage retrieval for open-domain question answering",
    392       "authors": ["V. Karpukhin", "B. Oğuz", "S. Min"],
    393       "year": 2020,
    394       "arxiv_id": "2004.04906",
    395       "relevance": "Introduces dense passage retrieval used as the retrieval component in RAG systems."
    396     },
    397     {
    398       "title": "Emergent abilities of large language models",
    399       "authors": ["J. Wei", "Y. Tay", "R. Bommasani"],
    400       "year": 2022,
    401       "arxiv_id": "2206.07682",
    402       "relevance": "Documents emergent capabilities in LLMs as a function of scale, relevant to understanding LLM capability limitations."
    403     },
    404     {
    405       "title": "Creating large language model applications utilizing LangChain: A primer on developing LLM apps fast",
    406       "authors": ["O. Topsakal", "T. C. Akinci"],
    407       "year": 2023,
    408       "relevance": "Describes the LangChain framework used to implement TelecomRAG's pipeline."
    409     }
    410   ],
    411   "engagement_factors": {
    412     "practical_relevance": {
    413       "score": 2,
    414       "justification": "Telecom professionals could benefit from a standards-querying RAG system, but no code or tool is released for anyone to actually use."
    415     },
    416     "surprise_contrarian": {
    417       "score": 0,
    418       "justification": "RAG improving domain-specific QA over vanilla LLMs is the expected result and confirms conventional wisdom."
    419     },
    420     "fear_safety": {
    421       "score": 0,
    422       "justification": "No safety, security, or risk concerns raised."
    423     },
    424     "drama_conflict": {
    425       "score": 0,
    426       "justification": "No controversy or conflict — straightforward system demonstration."
    427     },
    428     "demo_ability": {
    429       "score": 0,
    430       "justification": "No code, demo, or tool released. The system is internal to NEC Labs."
    431     },
    432     "brand_recognition": {
    433       "score": 1,
    434       "justification": "NEC Laboratories Europe is a known research lab but not a top-tier AI brand. Uses GPT-4 which is widely recognized."
    435     }
    436   }
    437 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs