如何从Spark文本分析中筛选出排除停用词后的5个高频词?

2026-06-11 10:08 4阅读 0评论 SEO资讯
  • 内容介绍
  • 相关推荐

本文共计207个文字,预计阅读时间需要1分钟。

如何从Spark文本分析中筛选出排除停用词后的5个高频词?

scala
package com.yl.wordcount

import java.io.File
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.Iterator
import scala.io.Source

/**
 * Minimal Spark word count: reads a local text file on the driver, counts
 * how often each space-separated token occurs, and saves the (word, count)
 * pairs as text files.
 */
object WordCount {

  /**
   * Entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Word Count")
    val sc = new SparkContext(conf)

    try {
      // The file is read eagerly on the driver, so the handle can be closed
      // before the lines are handed to Spark.
      val source = Source.fromFile(new File("path/to/input.txt"))
      val text =
        try source.getLines().toList
        finally source.close()

      val counts = sc
        .parallelize(text)
        .flatMap(_.split(" "))
        .map((_, 1))
        .reduceByKey(_ + _)

      counts.saveAsTextFile("path/to/output")
    } finally {
      // Release the context even when reading or saving fails.
      sc.stop()
    }
  }
}

package com.yl.wordcount

import java.io.File

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.Iterator
import scala.io.Source

/**
 * Word count that excludes stop words and sorts the result by descending
 * frequency.
 */
object WordCountStopWords {

  /**
   * Entry point.
   *
   * Loads the stop-word list from a local file (one word per line), counts
   * the remaining space-separated tokens of the input text, and saves
   * (count, word) pairs ordered from most to least frequent.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("spark://localhost:7077").setAppName("wordcount")
    val sc = new SparkContext(conf)
    val outFile = "/Users/admin/spark/sparkoutput"

    // A missing stop-word file means "filter nothing" instead of failing with
    // a NullPointerException. A Set keeps the per-token membership test cheap.
    val stopWordsFile = new File("/Users/admin/src" + "/tingyongci.txt")
    val stopWords: Set[String] =
      if (stopWordsFile.exists()) {
        val source = Source.fromFile(stopWordsFile)
        try source.getLines().toSet
        finally source.close()
      } else {
        Set.empty[String]
      }

    val textFile = sc.textFile("/Users/admin/spark/spark-1.5.1-bin-hadoop2.4/README.md")

    // Swap to (count, word) so that sortByKey orders by frequency.
    // The complete ranking is written out; `result.take(5)` would return only
    // the five most frequent words.
    val result = textFile
      .flatMap(_.split(" "))
      .filter(_.nonEmpty)
      .filter(word => !stopWords.contains(word))
      .map((_, 1))
      .reduceByKey(_ + _)
      .map { case (word, count) => (count, word) }
      .sortByKey(ascending = false)

    result.saveAsTextFile(outFile)
  }
}

如何从Spark文本分析中筛选出排除停用词后的5个高频词?

本文共计207个文字,预计阅读时间需要1分钟。

如何从Spark文本分析中筛选出排除停用词后的5个高频词?

scala
package com.yl.wordcount

import java.io.File
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.Iterator
import scala.io.Source

/**
 * Minimal Spark word count: reads a local text file on the driver, counts
 * how often each space-separated token occurs, and saves the (word, count)
 * pairs as text files.
 */
object WordCount {

  /**
   * Entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Word Count")
    val sc = new SparkContext(conf)

    try {
      // The file is read eagerly on the driver, so the handle can be closed
      // before the lines are handed to Spark.
      val source = Source.fromFile(new File("path/to/input.txt"))
      val text =
        try source.getLines().toList
        finally source.close()

      val counts = sc
        .parallelize(text)
        .flatMap(_.split(" "))
        .map((_, 1))
        .reduceByKey(_ + _)

      counts.saveAsTextFile("path/to/output")
    } finally {
      // Release the context even when reading or saving fails.
      sc.stop()
    }
  }
}

package com.yl.wordcount

import java.io.File

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.Iterator
import scala.io.Source

/**
 * Word count that excludes stop words and sorts the result by descending
 * frequency.
 */
object WordCountStopWords {

  /**
   * Entry point.
   *
   * Loads the stop-word list from a local file (one word per line), counts
   * the remaining space-separated tokens of the input text, and saves
   * (count, word) pairs ordered from most to least frequent.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("spark://localhost:7077").setAppName("wordcount")
    val sc = new SparkContext(conf)
    val outFile = "/Users/admin/spark/sparkoutput"

    // A missing stop-word file means "filter nothing" instead of failing with
    // a NullPointerException. A Set keeps the per-token membership test cheap.
    val stopWordsFile = new File("/Users/admin/src" + "/tingyongci.txt")
    val stopWords: Set[String] =
      if (stopWordsFile.exists()) {
        val source = Source.fromFile(stopWordsFile)
        try source.getLines().toSet
        finally source.close()
      } else {
        Set.empty[String]
      }

    val textFile = sc.textFile("/Users/admin/spark/spark-1.5.1-bin-hadoop2.4/README.md")

    // Swap to (count, word) so that sortByKey orders by frequency.
    // The complete ranking is written out; `result.take(5)` would return only
    // the five most frequent words.
    val result = textFile
      .flatMap(_.split(" "))
      .filter(_.nonEmpty)
      .filter(word => !stopWords.contains(word))
      .map((_, 1))
      .reduceByKey(_ + _)
      .map { case (word, count) => (count, word) }
      .sortByKey(ascending = false)

    result.saveAsTextFile(outFile)
  }
}

如何从Spark文本分析中筛选出排除停用词后的5个高频词?