# -*- coding: utf-8 -*-
# (Python 2 needs the declaration above because the pattern below is
# non-ASCII; it only takes effect on line 1 or 2 of the file.)
#
# Split each line of a UTF-8 text file on Chinese punctuation and print the
# resulting segments one per line. To split on additional punctuation marks,
# add them to the pattern separated by "|".
import io
import json
import re

# Compiled once so the loop below does not look the pattern up per line.
PUNCTUATION_RE = re.compile(u',|。')

# Pre-bind `p` so the JSON dump at the end still works for an empty file.
p = []

# Decode at the I/O boundary instead of reading bytes and decoding each line.
with io.open('data/chinese.utf8', encoding='utf-8') as f:
    for line in f:
        # Strip the line terminator before splitting; otherwise the last
        # segment keeps the newline and print() emits an extra blank line.
        p = PUNCTUATION_RE.split(line.rstrip(u'\r\n'))
        for segment in p:
            print(segment)

# Workaround for the Chinese list/dict display problem in PyCharm: dump the
# container as JSON with ensure_ascii=False so non-ASCII characters are
# written as-is instead of being escaped. `p` holds the segments of the last
# line read (or is empty when the file had no lines). The Python-2-only
# `encoding` argument is omitted: the text is already decoded, and Python 3
# rejects that keyword.
print(json.dumps(p, ensure_ascii=False))