from pyspark import SparkConf, SparkContext
import jieba.posseg as posseg
"""
将 两个 列表进行组装 能让算子进行计算哦
nous_list=[['淘宝'], ['故事', '高德地图'], ['故事', '笑话', '绕口令'], []]
st1_verb_list=['打开', '打开', '$', '跑']
"""
def my_tuple(nous_list, st1_verb_list):
    my_list = []
    for x in range(len(nous_list)):
        for y in range(len(nous_list[x])):
            # Pair each noun of sentence x with that sentence's first verb.
            noun = nous_list[x][y]
            verb = st1_verb_list[x]
            my_list.append((noun, verb))
    return my_list
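# A minimal sanity check of my_tuple; the demo_* names are hypothetical and the
# sample lists simply mirror the docstring example above, so this is a sketch of
# the expected pairing, not data from the real NAV.txt input.
demo_nouns = [['淘宝'], ['故事', '高德地图'], ['故事', '笑话', '绕口令'], []]
demo_first_verbs = ['打开', '打开', '$', '跑']
assert my_tuple(demo_nouns, demo_first_verbs) == [
    ('淘宝', '打开'), ('故事', '打开'), ('高德地图', '打开'),
    ('故事', '$'), ('笑话', '$'), ('绕口令', '$')]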
def merge(rows):
    # Flatten a list of lists into a single flat list; the input list is emptied afterwards.
    result = []
    for x in range(len(rows)):
        result.extend(rows[x])
    rows.clear()
    return result
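# A quick illustration of merge (demo_rows is a hypothetical stand-in for the
# per-sentence lists collected from Spark); note that the input list is cleared.
demo_rows = [['打开', '淘宝'], ['背', '故事']]
assert merge(demo_rows) == ['打开', '淘宝', '背', '故事']
assert demo_rows == []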
"""
移除verb_list中的所有动词
Tagging_list = [['v', 'v','n'], ['n', 'v','n'],['n','n','n']]
verb_list = [['打开', '背','淘宝'], ['故事', '打开','高德地图'],['故事','笑话','绕口令']]
"""
Tagging_list = [['v','v','n'], ['n', 'v','n'],['n','n','n']]
verb_list = [['打开', '背','淘宝'], ['故事', '打开','高德地图'],['故事','笑话','绕口令']]
def remove_verb(Tagging_list, verb_list):
    # First pass: overwrite every position tagged 'v' with a 'Null' placeholder.
    for x in range(len(Tagging_list)):
        verb_list[x] = list(verb_list[x])
        for y in range(len(Tagging_list[x])):
            if Tagging_list[x][y] == 'v':
                verb_list[x][y] = 'Null'
    # Second pass: strip the placeholders so only the nouns remain.
    for x in range(len(verb_list)):
        while 'Null' in verb_list[x]:
            verb_list[x].remove('Null')
    return verb_list
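# A minimal sanity check of remove_verb using fresh copies of the sample data above
# (fresh copies because the function mutates its verb_list argument); the demo_*
# names are hypothetical, and the expected result is the nous_list shape shown earlier.
demo_tags = [['v', 'v', 'n'], ['n', 'v', 'n'], ['n', 'n', 'n']]
demo_words = [['打开', '背', '淘宝'], ['故事', '打开', '高德地图'], ['故事', '笑话', '绕口令']]
assert remove_verb(demo_tags, demo_words) == [['淘宝'], ['故事', '高德地图'], ['故事', '笑话', '绕口令']]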
# """
# Here we walk through the per-sentence lists, e.g.:
# [['v', 'v'], ['n', 'v']]
# [['打开', '背'], ['故事', '打开']]
# """
# def get_1st_verb(verb_list,Tagging_list,get_verb=False):
# result_V=""
# result_N=""
# ery_sentence_verb=[]
# for x in range(len(Tagging_list)):
# for y in range(len(Tagging_list[x])):
# if 'v' == Tagging_list[x][y] and get_verb==False:
# result_V = verb_list[x][y]
# ery_sentence_verb.append(result_V)
# get_verb = True
# else:
# result_V='$'
# ery_sentence_verb.append(result_V)
# return ery_sentence_verb
"""
获取每组数组的动词
返回:['打开', '打开', '$']
"""
def get_1st_verb(verb_list, Tagging_list):
    st1_verb_list = []
    for x in range(len(Tagging_list)):
        try:
            # Position of the first 'v' tag in this sentence.
            index = Tagging_list[x].index('v')
        except ValueError:
            index = -1
        if index != -1:
            st1_verb_list.append(verb_list[x][index])
        else:
            # No verb in this sentence: use '$' as a placeholder.
            st1_verb_list.append('$')
    return st1_verb_list
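# A minimal sanity check of get_1st_verb on the same sample shape as above
# (demo_* names are hypothetical); the expected output matches the docstring.
demo_tags_1st = [['v', 'v', 'n'], ['n', 'v', 'n'], ['n', 'n', 'n']]
demo_words_1st = [['打开', '背', '淘宝'], ['故事', '打开', '高德地图'], ['故事', '笑话', '绕口令']]
assert get_1st_verb(demo_words_1st, demo_tags_1st) == ['打开', '打开', '$']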
def split(line):
    # Segment one line with jieba and POS-tag it, keeping only verbs ('v') and nouns ('n').
    words = []
    tags = []
    for word, flag in posseg.cut(line):
        if flag == 'v' or flag == 'n':
            words.append(word)
            tags.append(flag)
    # The helper functions above expect one inner list per sentence, e.g.
    # Tagging_list = [['v', 'v', 'n']] and verb_list = [['打开', '背', '淘宝']],
    # so wrap this line's words and tags in a single-element outer list.
    verb_list = [words]
    Tagging_list = [tags]
    st1_verb_list = get_1st_verb(verb_list, Tagging_list)  # e.g. ['打开']
    nous_list = remove_verb(Tagging_list, verb_list)       # e.g. [['淘宝']]
    result_list = my_tuple(nous_list, st1_verb_list)       # (noun, verb) pairs, currently unused
    print(st1_verb_list)
    return st1_verb_list
def main(sc):
    # Read the input file (one sentence per line).
    text = sc.textFile("D:/NAV.txt")
    # Segment every line and collect the per-line results back to the driver as a list.
    word_list = text.map(split).collect()
    print(len(word_list))
    #print(word_list)
    #input_list = merge(word_list)
    #print(input_list)
    # Return the first element in the list; expected shapes of input_list, e.g.:
    #input_list = [('打开', '淘宝'), ('背', '故事'), ('打开', '淘宝'), ('打开', '淘宝'), ('背', '故事'), ('背', '古诗')]
    #input_list = ['(淘宝,打开)', '(故事,打开)', '(高德地图,打开)', '(故事,$)', '(笑话,$)', '(绕口令,$)']
    #count = sc.parallelize(word_list)
    #final_rdd = count.map(lambda word: (word, 1)).collect()
    #print(final_rdd)
    #results = count.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b).collect()
    #    .repartition(1).saveAsTextFile("C:\\Users\\yunduo\\Desktop\\22.txt")
    #print(results)
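    # A hedged sketch of the counting step that the commented-out lines above hint at:
    # flatten the per-line verb lists with merge(), then do a standard (word, 1) ->
    # reduceByKey word count. This is an assumption about the intended pipeline
    # (flat_words and counts are hypothetical names), not something the original ran.
    flat_words = merge(word_list)
    counts = (sc.parallelize(flat_words)
                .map(lambda word: (word, 1))
                .reduceByKey(lambda a, b: a + b)
                .collect())
    print(counts)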
if __name__ =="__main__":
#Create SparkConf
sparkConf =SparkConf().setAppName('Python').setMaster('local[2]')
#Create SparkContext
sc=SparkContext(conf=sparkConf)
main(sc)
#print("Fv"+first_verb)
# print("TV"+Two_verb)
Headache.