from pyspark import SparkConf, SparkContext
import jieba.posseg as posseg


def merge(nested):
    # Flatten the list of per-line verb lists into one flat list.
    result = []
    for x in range(len(nested)):
        result.extend(nested[x])
    nested.clear()
    return result


def split(line):
    # Segment the line with jieba, keep the POS tag, and collect verbs shorter than 3 characters.
    verb_list = []
    seg_list = posseg.cut(line)
    for word, flag in seg_list:
        if len(word) < 3 and flag == 'v':
            verb_list.append(word)
    return verb_list


def main(sc):
    # Read the input file.
    text = sc.textFile("D:/NAV.txt")
    # Segment each line; collect() yields a list of per-line verb lists.
    word_list = text.map(split).collect()
    print(len(word_list))
    print(word_list)
    input_list = merge(word_list)
    print(input_list)
    # Count each verb: map to (word, 1) pairs and reduce by key.
    count = sc.parallelize(input_list)
    results = count.map(lambda word: (word, 1)) \
                   .reduceByKey(lambda a, b: a + b) \
                   .collect()
    # .repartition(1).saveAsTextFile("C:\\Users\\yunduo\\Desktop\\22.txt")
    print(results)


if __name__ == "__main__":
    # Create SparkConf and SparkContext, then run.
    sparkConf = SparkConf().setAppName('Python').setMaster('local[2]')
    sc = SparkContext(conf=sparkConf)
    main(sc)

Output from the run:
2
[['打开', '背'], ['打开']]
['打开', '背', '打开']
[Stage 1:> (0 + 2) / 2]D:\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py:58: UserWarning: Please install psutil to have better support with spilling
D:\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py:58: UserWarning: Please install psutil to have better support with spilling
[Stage 2:> (0 + 2) / 2]D:\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py:58: UserWarning: Please install psutil to have better support with spilling
D:\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py:58: UserWarning: Please install psutil to have better support with spilling
[('背', 1), ('打开', 2)]
Note: the structure you get after split is awkward to work with. Because map is applied per line, the result is one array per input line, i.e. a list of lists. Feeding that nested structure straight into a further map or flatMap step raises errors: map fails with a "too many values" error, and flatMap complains that a list is not hashable. So the per-line arrays have to be flattened into a single array first; after merge() everything works. (A hedged sketch of doing the flattening with flatMap inside Spark instead follows below.)
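For reference, a minimal sketch of the flatMap route, which flattens inside Spark so the collect() + merge() round trip is unnecessary. It reuses the split() function and the D:/NAV.txt path from the code above; the function name count_verbs_with_flatmap and everything else here are my own assumptions and have not been run against this dataset.

# Hedged sketch: flatten the per-line verb lists inside Spark with flatMap,
# instead of collect() + merge(). Reuses split() from the code above.
def count_verbs_with_flatmap(sc, path="D:/NAV.txt"):     # path taken from the original code
    text = sc.textFile(path)
    return (text.flatMap(split)                          # one verb per record, no nested lists
                .map(lambda word: (word, 1))
                .reduceByKey(lambda a, b: a + b)
                .collect())

With the sample input above this should give the same [('背', 1), ('打开', 2)] pairs, just without materialising the intermediate list on the driver.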
OK. The verbs have been extracted; sorting them and taking the top N should be straightforward (a small sketch follows below). The next step is to work out how to pull out verbs together with their associated nouns.
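A small sketch of that "sort and take the top N" step, applied to the count RDD defined in main() above; takeOrdered and the cutoff of 10 are assumptions on my part, not something fixed in the post.

# Hedged sketch: the 10 most frequent verbs, counted and ordered inside Spark.
# N = 10 is an arbitrary choice; adjust to taste.
top_n = (count.map(lambda word: (word, 1))
              .reduceByKey(lambda a, b: a + b)
              .takeOrdered(10, key=lambda pair: -pair[1]))   # highest count first
print(top_n)

Since results has already been collect()ed in main(), a plain sorted(results, key=lambda p: p[1], reverse=True)[:10] on the driver would do the same job.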