import sys
import os
import json
import re
import numpy as np
def PraseRawdata(author = None,constrain = None,src='./chinese-poetry/json/simplified', category="poet.tang"):
def sentenceParse(para):
    """Strip annotations and stray symbols from one piece of poem text.

    The following are removed, in this order:

    1. parenthesised notes, written with ASCII ``()`` or full-width
       (U+FF08 / U+FF09) parentheses;
    2. ``{...}`` spans;
    3. ``《...》`` spans;
    4. square brackets, ASCII digits and hyphens.

    Finally each doubled full stop ``。。`` is collapsed to a single ``。``.

    All span patterns are greedy: on one line, everything from the first
    opening mark to the last closing mark is dropped.

    Args:
        para: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # The parentheses live inside character classes so they are matched
    # literally; a bare "(.*)" is a capture group that swallows the whole
    # line.  Full-width forms are spelled as escapes to keep them unambiguous.
    res = re.sub("[(\uFF08].*[)\uFF09]", "", para)
    res = re.sub(r"\{.*\}", "", res)
    res = re.sub("《.*》", "", res)
    # One pass drops brackets, ASCII digits and hyphens.  [0-9] is used
    # rather than \d so that only ASCII digits are removed.
    res = re.sub(r"[\[\]0-9-]", "", res)
    # Removing a span between two sentences can leave two stops side by side.
    return res.replace("。。", "。")
def haddlejson(file):
rst =[]
data = json.loads(open(file).read())
for poetry in data:
pdata =""
if(author is not None and poetry.get("author")!= author):
return None
p = poetry.get("paragraphs")
flag = False
for s in p:
sp = re.split(u"[,!。]", s)
for tr in sp:
if constrain is not None and len(tr) != constra