scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26050B)
      1 {
      2   "paper": {
      3     "title": "Large Language Model (LLM) for Telecommunications: A Comprehensive Survey on Principles, Key Techniques, and Opportunities",
      4     "authors": [
      5       "Hao Zhou",
      6       "Chengming Hu",
      7       "Ye Yuan",
      8       "Yufei Cui",
      9       "Yili Jin",
     10       "Can Chen",
     11       "Haolun Wu",
     12       "Dun Yuan",
     13       "Li Jiang",
     14       "Di Wu",
     15       "Xue Liu",
     16       "Charlie Zhang",
     17       "Xianbin Wang",
     18       "Jiangchuan Liu"
     19     ],
     20     "year": 2024,
     21     "venue": "IEEE Communications Surveys and Tutorials",
     22     "arxiv_id": "2405.10825",
     23     "doi": "10.1109/COMST.2024.3465447"
     24   },
     25   "scan_version": 2,
     26   "active_modules": ["survey_methodology"],
     27   "methodology_tags": ["meta-analysis"],
     28   "key_findings": "This survey categorizes LLM applications in telecommunications into generation (domain knowledge, code, network configuration), classification (security, text, image, traffic), optimization (RL reward design, black-box, convex, heuristic), and prediction (time-series, multi-modal). The paper identifies that telecom-specific LLM training is bottlenecked by limited domain datasets, and that prompt engineering and multi-step planning are critical for complex telecom tasks. Key challenges include practical deployment (latency, compute constraints at network edge), hallucination in safety-critical applications, and the high cost of LLM training and inference for telecom operators.",
     29   "checklist": {
     30     "artifacts": {
     31       "code_released": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No code repository, analysis scripts, or supplementary materials are released. The paper is a narrative survey with no associated code artifacts."
     35       },
     36       "data_released": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No curated paper list, extracted data, or search corpus is released. The survey references existing datasets (e.g., TeleQnA [125], EdgeIIoTset [155]) but does not release its own compiled data."
     40       },
     41       "environment_specified": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "As a survey paper, it could have provided a reproducible analysis environment, but no environment specifications are given."
     45       },
     46       "reproduction_instructions": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No instructions for reproducing the survey's paper selection, categorization, or analysis are provided."
     50       }
     51     },
     52     "statistical_methodology": {
     53       "confidence_intervals_or_error_bars": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "Survey paper with no original experiments. All reported numbers are from cited studies."
     57       },
     58       "significance_tests": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "Survey paper with no original experiments or statistical comparisons."
     62       },
     63       "effect_sizes_reported": {
     64         "applies": false,
     65         "answer": false,
     66         "justification": "Survey paper with no original experiments. Effect sizes mentioned are from cited works."
     67       },
     68       "sample_size_justified": {
     69         "applies": false,
     70         "answer": false,
     71         "justification": "Survey paper with no original experiments or sample collection."
     72       },
     73       "variance_reported": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "Survey paper with no original experiments."
     77       }
     78     },
     79     "evaluation_design": {
     80       "baselines_included": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Table I explicitly compares this survey with 11 existing surveys ([15]-[17], [28]-[35]) across 20 topic dimensions, clearly positioning this work against prior literature reviews."
     84       },
     85       "baselines_contemporary": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The compared surveys are from 2023-2024, which is contemporary to this work's 2024 publication. References include [15]-[17] and [28]-[35], all recent."
     89       },
     90       "ablation_study": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Survey paper with no system components to ablate."
     94       },
     95       "multiple_metrics": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "Survey paper with no experiments requiring metrics."
     99       },
    100       "human_evaluation": {
    101         "applies": false,
    102         "answer": false,
    103         "justification": "Survey paper with no system outputs to evaluate."
    104       },
    105       "held_out_test_set": {
    106         "applies": false,
    107         "answer": false,
    108         "justification": "Survey paper with no experiments."
    109       },
    110       "per_category_breakdown": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The survey provides extensive per-category breakdowns: Tables IV (generation studies), VI (classification studies), VIII (optimization studies), IX (optimization techniques summary), XI (prediction techniques), and XII (telecom datasets). Each category has its own dedicated section with detailed analysis."
    114       },
    115       "failure_cases_discussed": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section VIII-A discusses challenges and failure modes: telecom-domain LLM training difficulties, practical deployment issues (latency, compute constraints), prompt engineering challenges, and hallucination problems. Specific failures are noted, e.g., the failed FFT prompt in Section IV-C2 and LLMs giving different answers to the same telecom question (Section IV-B3)."
    119       },
    120       "negative_results_reported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper reports several negative findings: LLMs struggle with complex planning tasks (Section VIII-B2), GPT-4 and LLaMA give conflicting answers to telecom questions (Section IV-B3, different frequency band answers), best correct rate for troubleshooting is only ~60% (Section IV-B3), and monolithic prompts fail for complex coding tasks (Section IV-C2)."
    124       }
    125     },
    126     "claims_and_evidence": {
    127       "abstract_claims_supported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The abstract claims the survey provides a 'comprehensive overview of LLM-enabled telecom networks' covering generation, classification, optimization, and prediction. The body delivers on these claims with dedicated sections (IV-VII) for each topic, extensive tables, and detailed discussion of techniques and applications."
    131       },
    132       "causal_claims_justified": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper relays causal claims from source papers without assessing their methodological quality. For example, it states 'LLM greatly reduced the coding time of undergraduate and graduate students by 65.16% and 68.44%' (from [14]) and 'LLM-designed reward functions can rival or even surpass manually designed reward functions' (from [42]-[44]) without evaluating whether the underlying study designs support causal inference."
    136       },
    137       "generalization_bounded": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper frequently extrapolates from small-scale studies to broad telecom potential without bounding. For example, Section IV-C generalizes from a few coding studies ([14], [18], [119]-[121]) to 'LLM-aided coding can greatly save human effort' for telecom broadly. The title claims 'comprehensive survey' but many proposed applications (e.g., multi-modal LLM for CSI prediction, verbal RL for network optimization) have zero empirical validation in telecom settings."
    141       },
    142       "alternative_explanations_discussed": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper does not discuss alternative explanations for the reported findings. For instance, when reporting coding time reductions from [14], no consideration is given to novelty effects, task selection bias, or learning-curve confounds. Section VIII discusses challenges but focuses on implementation obstacles, not alternative interpretations of results."
    146       },
    147       "proxy_outcome_distinction": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The paper does not distinguish between proxies and actual outcomes. For example, it equates 'coding accuracy' on specific tasks with general 'code generation capability,' and 'question answering accuracy' on curated datasets with 'telecom domain knowledge.' The gap between benchmark measurements and real-world telecom deployment performance is never acknowledged."
    151       }
    152     },
    153     "setup_transparency": {
    154       "model_versions_specified": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "Survey paper that does not use models directly."
    158       },
    159       "prompts_provided": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "Survey paper that does not use prompting directly. Prompt examples shown are from cited works."
    163       },
    164       "hyperparameters_reported": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "Survey paper that does not run experiments."
    168       },
    169       "scaffolding_described": {
    170         "applies": false,
    171         "answer": false,
    172         "justification": "Survey paper that does not use agentic scaffolding."
    173       },
    174       "data_preprocessing_documented": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No paper selection pipeline is described. The survey does not explain how papers were found, what databases were searched, what search queries were used, what time period was covered, or what inclusion/exclusion criteria were applied."
    178       }
    179     },
    180     "limitations_and_scope": {
    181       "limitations_section_present": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "Section VIII discusses 'Challenges and Future Directions' but these are challenges of applying LLMs to telecom, not limitations of the survey methodology itself. There is no section discussing the survey's own limitations (e.g., potential selection bias in papers reviewed, coverage gaps, or methodological limitations of the review process)."
    185       },
    186       "threats_to_validity_specific": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No threats to validity are discussed for the survey itself. The challenges section (VIII-A) discusses telecom-domain challenges, not the survey's methodological threats."
    190       },
    191       "scope_boundaries_stated": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section I states clear scope boundaries: 'this work focuses on generative models that were originally developed for language tasks,' distinguishes 'foundation models' from 'LLM-enabled' approaches, defines when 'LLM' vs 'multi-modal LLM' terms are used, and explicitly states the paper covers 'nearly 20 telecom application scenarios and LLM-inspired novel techniques.'"
    195       }
    196     },
    197     "data_integrity": {
    198       "raw_data_available": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No raw data (search results, paper lists, coding sheets) are available for verification. The set of papers reviewed is not enumerable from the paper alone."
    202       },
    203       "data_collection_described": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No description of how the surveyed papers were collected. There is no search strategy, no database list, no query terms, and no description of the literature search process."
    207       },
    208       "recruitment_methods_described": {
    209         "applies": false,
    210         "answer": false,
    211         "justification": "No human participants. The paper is a literature survey reviewing published works."
    212       },
    213       "data_pipeline_documented": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No pipeline from literature search to final paper selection is documented. The reader cannot determine how papers were found, screened, or included/excluded."
    217       }
    218     },
    219     "conflicts_of_interest": {
    220       "funding_disclosed": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding sources or acknowledgments section is visible in the provided paper text. No grants, sponsors, or funding agencies are mentioned."
    224       },
    225       "affiliations_disclosed": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Author affiliations are clearly listed: McGill University, Samsung Research America (Charlie Zhang), Western University (Xianbin Wang), and Simon Fraser University (Jiangchuan Liu)."
    229       },
    230       "funder_independent_of_outcome": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No funding is disclosed, making independence unverifiable. One author (Charlie Zhang) is from Samsung Research America, which has commercial interests in both telecommunications and AI/LLM technologies."
    234       },
    235       "financial_interests_declared": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No competing interests or financial interests statement is present in the paper."
    239       }
    240     },
    241     "contamination": {
    242       "training_cutoff_stated": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "Survey paper that does not evaluate pre-trained models on benchmarks."
    246       },
    247       "train_test_overlap_discussed": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "Survey paper that does not evaluate pre-trained models on benchmarks."
    251       },
    252       "benchmark_contamination_addressed": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "Survey paper that does not evaluate pre-trained models on benchmarks."
    256       }
    257     },
    258     "human_studies": {
    259       "pre_registered": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "Survey paper with no human participants."
    263       },
    264       "irb_or_ethics_approval": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "Survey paper with no human participants."
    268       },
    269       "demographics_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "Survey paper with no human participants."
    273       },
    274       "inclusion_exclusion_criteria": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "Survey paper with no human participants."
    278       },
    279       "randomization_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "Survey paper with no human participants."
    283       },
    284       "blinding_described": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "Survey paper with no human participants."
    288       },
    289       "attrition_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "Survey paper with no human participants."
    293       }
    294     },
    295     "cost_and_practicality": {
    296       "inference_cost_reported": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "Survey paper with no method of its own. Cost discussion in Section VIII-B9 is about the field, not the survey's own methodology."
    300       },
    301       "compute_budget_stated": {
    302         "applies": false,
    303         "answer": false,
    304         "justification": "Survey paper with no computational experiments."
    305       }
    306     },
    307     "survey_methodology": {
    308       "prisma_or_structured_protocol": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No PRISMA flow diagram, no structured search strategy, no reproducible queries, and no reference to any established review methodology. The paper appears to use ad-hoc paper collection rather than a systematic protocol."
    312       },
    313       "quality_assessment_of_sources": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No quality assessment of the surveyed papers is performed. All cited studies are treated equally regardless of methodological rigor. For example, the paper cites master's theses ([23], [115], [116]), workshop papers, and top-venue publications interchangeably without evaluating study quality."
    317       },
    318       "publication_bias_discussed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No discussion of publication bias. The survey does not consider whether the reviewed literature skews toward positive results about LLM capabilities in telecom, nor does it discuss potential underrepresentation of negative findings."
    322       }
    323     }
    324   },
    325   "claims": [
    326     {
    327       "claim": "LLMs can reduce coding time by 65.16% for undergraduates and 68.44% for graduates when developing FPGA-based wireless communication systems",
    328       "evidence": "Cited from Du et al. [14], which used ChatGPT for Verilog code generation in the OpenWiFi project. Specific percentages reported in Section IV-C.",
    329       "supported": "moderate"
    330     },
    331     {
    332       "claim": "Combining GPT-4 with proper libraries (e.g., NetworkX) can achieve 88% and 78% coding accuracy for traffic analysis and network lifecycle management",
    333       "evidence": "Cited from Mani et al. [119] in Section IV-C2. The experiment used application-specific and code generation prompts.",
    334       "supported": "moderate"
    335     },
    336     {
    337       "claim": "SecurityBERT achieves average accuracy of 0.98, recall of 0.84, and F1-score of 0.84 for detecting 14 distinct network attacks, significantly surpassing traditional ML and DL models",
    338       "evidence": "Cited from Ferrag et al. [139] in Section V-B2. Used EdgeIIoTset dataset for IoT/IIoT threats.",
    339       "supported": "moderate"
    340     },
    341     {
    342       "claim": "LLM-aided automated reward function design produces comparable or even superior performance to human manual designs in robotics and control tasks",
    343       "evidence": "Cited from Song et al. [42], Kwon et al. [43], and Ma et al. [44] in Section VI-B1. Ma et al. report outperforming human experts on 83% of tasks.",
    344       "supported": "moderate"
    345     },
    346     {
    347       "claim": "GPT-4 and LLaMA give different answers to the same telecom domain question, potentially misleading non-expert users",
    348       "evidence": "Section IV-B3 reports that GPT-4 identifies 5G spectrum bands as 'below 1 GHz, 1-6 GHz and above 6 GHz' while LLaMA identifies them as 'below 600 MHz, 600 MHz-24 GHz and above 24 GHz.' From Soman et al. [117].",
    349       "supported": "strong"
    350     },
    351     {
    352       "claim": "ET-BERT achieves improvements of 5.4%, 0.2%, and 5.2% over state-of-the-art methods for encrypted traffic classification tasks",
    353       "evidence": "Cited from Lin et al. [146] in Section V-E. Tested across general encrypted application classification, encrypted malware classification, and VPN traffic classification.",
    354       "supported": "moderate"
    355     },
    356     {
    357       "claim": "LLM-aided OptiMUS achieves nearly 0.8 success rate for solving 41 linear programming and 11 mixed-integer linear programming problems",
    358       "evidence": "Cited from AhmadiTeshnizi et al. [179] in Section VI-D. Uses GPT-4 for automated problem modeling, code generation, and solver implementation.",
    359       "supported": "moderate"
    360     },
    361     {
    362       "claim": "Multi-modal LLMs offer promising solutions for integrated sensing and communication in 6G networks",
    363       "evidence": "Section VII-E discusses potential applications including CSI prediction, beamforming, traffic load prediction, and QoE prediction. However, no empirical validation of multi-modal LLMs for telecom prediction is cited.",
    364       "supported": "weak"
    365     }
    366   ],
    367   "red_flags": [
    368     {
    369       "flag": "No systematic review methodology",
    370       "detail": "The survey does not follow PRISMA or any structured review protocol. No search strategy, database list, search queries, or inclusion/exclusion criteria are provided. Paper selection appears ad-hoc, making the survey non-reproducible and potentially subject to selection bias."
    371     },
    372     {
    373       "flag": "No quality assessment of source papers",
    374       "detail": "All cited studies are treated equally regardless of methodological rigor. Master's theses ([23], [115], [116]), workshop papers, and top-venue publications are cited interchangeably. Claims from small-scale studies with limited evaluation (e.g., [14] with a few students) are presented alongside large-scale evaluations without quality differentiation."
    375     },
    376     {
    377       "flag": "Speculative claims without empirical validation",
    378       "detail": "Many proposed applications (multi-modal LLM for CSI prediction, verbal RL for network optimization, LLM as black-box optimizer for telecom) are presented as 'promising' without any empirical validation in telecom settings. The paper blurs the line between demonstrated capabilities and speculative future applications."
    379     },
    380     {
    381       "flag": "Undeclared potential conflict of interest",
    382       "detail": "Co-author Charlie Zhang is from Samsung Research America, which has commercial interests in both telecommunications and LLM technologies. This affiliation is listed but not discussed as a potential conflict of interest. No competing interests statement is provided."
    383     },
    384     {
    385       "flag": "Overclaiming from limited evidence",
    386       "detail": "The paper frequently uses phrases like 'great potential,' 'promising opportunities,' and 'revolutionary changes' when discussing LLM applications in telecom. Many of these claims are based on results from non-telecom domains (robotics, general NLP) extrapolated to telecom without validation."
    387     }
    388   ],
    389   "cited_papers": [
    390     {
    391       "title": "The power of large language models for wireless communication system development: A case study on FPGA platforms",
    392       "authors": ["Y. Du", "S. C. Liew", "K. Chen", "Y. Shao"],
    393       "year": 2023,
    394       "arxiv_id": "2307.07319",
    395       "relevance": "Empirical study of using LLMs for Verilog code generation in wireless systems, reporting coding time reductions for students."
    396     },
    397     {
    398       "title": "Toward reproducing network research results using large language models",
    399       "authors": ["Q. Xiang", "Y. Lin", "M. Fang"],
    400       "year": 2023,
    401       "relevance": "Study of using ChatGPT to reproduce networking systems code, demonstrating LLM capabilities and limitations for code generation."
    402     },
    403     {
    404       "title": "Repairing bugs in Python assignments using large language models",
    405       "authors": ["J. Zhang", "J. Cambronero", "S. Gulwani"],
    406       "year": 2022,
    407       "arxiv_id": "2209.14876",
    408       "relevance": "LLM-based automatic program repair achieving 86.71% repair rate, relevant to AI-assisted code quality."
    409     },
    410     {
    411       "title": "Benchmarking large language models for automated Verilog RTL code generation",
    412       "authors": ["S. Thakur", "B. Ahmad", "Z. Fan"],
    413       "year": 2023,
    414       "relevance": "Benchmarking study of LLMs for hardware description language generation, showing fine-tuning improves code correctness by 26%."
    415     },
    416     {
    417       "title": "Self-refined large language model as automated reward function designer for deep reinforcement learning in robotics",
    418       "authors": ["J. Song", "Z. Zhou", "J. Liu"],
    419       "year": 2023,
    420       "arxiv_id": "2309.06687",
    421       "relevance": "LLM-based automated reward function design achieving comparable performance to human-designed rewards."
    422     },
    423     {
    424       "title": "Eureka: Human-level reward design via coding large language models",
    425       "authors": ["Y. J. Ma", "W. Liang", "G. Wang"],
    426       "year": 2023,
    427       "arxiv_id": "2310.12931",
    428       "relevance": "LLM reward design system outperforming human experts on 83% of tasks, relevant to LLM-enabled optimization."
    429     },
    430     {
    431       "title": "Reflexion: Language agents with verbal reinforcement learning",
    432       "authors": ["N. Shinn", "F. Cassano", "A. Gopinath"],
    433       "year": 2024,
    434       "relevance": "Verbal reinforcement learning framework achieving 91% on HumanEval, relevant to agentic LLM capabilities."
    435     },
    436     {
    437       "title": "Large language models as optimizers",
    438       "authors": ["C. Yang", "X. Wang", "Y. Lu"],
    439       "year": 2023,
    440       "arxiv_id": "2309.03409",
    441       "relevance": "Using LLMs for optimization via prompt design, relevant to LLM capability evaluation."
    442     },
    443     {
    444       "title": "Large language models are zero-shot time series forecasters",
    445       "authors": ["N. Gruver", "M. Finzi", "S. Qiu", "A. G. Wilson"],
    446       "year": 2024,
    447       "relevance": "Demonstrating LLM zero-shot prediction capabilities on time series data, relevant to LLM capability evaluation."
    448     },
    449     {
    450       "title": "Time-LLM: Time series forecasting by reprogramming large language models",
    451       "authors": ["M. Jin", "S. Wang", "L. Ma"],
    452       "year": 2023,
    453       "arxiv_id": "2310.01728",
    454       "relevance": "Reprogramming frozen LLMs for time series prediction using soft prompts, relevant to LLM capability beyond NLP."
    455     },
    456     {
    457       "title": "Revolutionizing cyber threat detection with large language models: A privacy-preserving BERT-based lightweight model for IoT/IIoT devices",
    458       "authors": ["M. A. Ferrag", "M. Ndhlovu", "N. Tihanyi"],
    459       "year": 2024,
    460       "relevance": "SecurityBERT for network attack detection achieving 0.98 accuracy, relevant to LLM safety and security applications."
    461     },
    462     {
    463       "title": "SecureBERT: A domain-specific language model for cybersecurity",
    464       "authors": ["E. Aghaei", "X. Niu", "W. Shadid", "E. Al-Shaer"],
    465       "year": 2022,
    466       "relevance": "Domain-specific LLM fine-tuning for cybersecurity applications, relevant to domain adaptation of LLMs."
    467     },
    468     {
    469       "title": "Enhancing network management using code generated by large language models",
    470       "authors": ["S. K. Mani", "Y. Zhou", "K. Hsieh"],
    471       "year": 2023,
    472       "relevance": "LLM-generated code for traffic analysis and network lifecycle management achieving 78-88% accuracy."
    473     }
    474   ]
    475 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs