# -*- coding: utf-8 -*-
import sys
reload(sys)                       # Python 2 only: reload is needed before resetting the default encoding
sys.setdefaultencoding('gbk')     # make implicit str/unicode conversions use GBK
import pandas as pd
import string
def cuttxt(fname, char):
    # Split the file on char; write each piece on its own line with the
    # delimiter (padded with spaces) restored on the line after it.
    f = open(fname, 'r')
    text = f.read()
    f.close()
    line1 = text.split(char)
    f = open(fname, 'w+')
    for i in line1:
        if len(i) > 1:
            if not i[0:7] == " ":  # with len(i) > 1 this check is effectively always true
                print i[0:7]       # debug: show the start of the piece being written
                f.write(i + '\n' + " " + char + " " + '\n')
            else:
                f.write(i + '\n')
    f.close()
def cuttxtline(fname, char):
    # Same behaviour as cuttxt above (kept as a separate name for callers that use it).
    f = open(fname, 'r')
    text = f.read()
    f.close()
    line1 = text.split(char)
    f = open(fname, 'w+')
    for i in line1:
        if len(i) > 1:
            if not i[0:7] == " ":
                print i[0:7]
                f.write(i + '\n' + " " + char + " " + '\n')
            else:
                f.write(i + '\n')
    f.close()
import shutil
import os
shutil.copy('readme.txt', 'test.txt')  # copy the source file to the working file
def fuhaoconghoufenge(fname, char):  # cut after the delimiter: start a new line after each char
    f = open(fname, 'r')
    text = f.read()
    f.close()
    line1 = text.split(char)
    f = open(fname, 'w+')
    for i in line1:
        if len(i) > 1:
            f.write(i.strip() + char + '\n')
    f.close()
def fuhaofengedel(fname, char):  # drop the delimiter entirely; each remaining piece goes on its own line
    f = open(fname, 'r')
    text = f.read()
    f.close()
    line1 = text.split(char)
    f = open(fname, 'w+')
    for i in line1:
        if len(i) > 1:
            f.write(i.strip() + '\n')
    f.close()
def hebing(fname, char1, char2):  # merge: when char1 ends a line and char2 starts the next, join them into one token
    f = open(fname, 'r')
    text = f.read()
    f.close()
    line1 = text.split(char1 + '\n' + char2)
    f = open(fname, 'w+')
    for i in line1:
        if len(i) > 3:
            f.write(i.strip() + '\n' + char1 + char2 + '\n')
    f.close()
def lianggecidoubufenkai(fname, char1, char2):
    # Keep the two-word compound char1+char2 together: first break around char2,
    # then merge the resulting "char1 / char2" line pair back into one line.
    fuhaocongqianhoufenge(fname, char2)  # defined further down; resolved at call time
    hebing(fname, char1, char2)
#s = unicode(str1)  # treat Chinese and English text as one character per code point, then process it
#for i in range(len(s)+1):
#    print s[i:i+1]
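# A minimal sketch of the idea in the commented-out lines above, assuming the
# goal is to walk mixed Chinese/English text one character at a time.  The
# sample string and the helper name are made up for illustration and the
# function is never called by this script.
def zifu_demo():
    s = u'ab中文cd'              # unicode literal: each Chinese character is a single element
    for i in range(len(s)):
        print s[i]               # prints one character per line, Chinese or English alike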
def tihuan(fname, char, char1):  # replace every occurrence of char in the file with char1
    f = open(fname, 'r')
    text = f.read()
    f.close()
    line1 = text.split(char)
    f = open(fname, 'w+')
    f.write(char1.join(line1))   # re-join the pieces with the replacement string
    f.close()
def replacetxt(fname, char, char1):  # replace char with char1, keeping the rest of the text untouched
    f = open(fname, 'r')
    text = f.read()
    f.close()
    line1 = text.split(char)
    f = open(fname, 'w+')
    for n, i in enumerate(line1):
        if n > 0:
            f.write(char1)  # write the replacement between pieces, not before the first one
        f.write(i)
    f.close()
#######################
fnamezx = 'dicc1.csv'    # word-list CSV (one dictionary word per row, column 'word')
fnamezxk = 'test.txt'    # working copy of the article to be segmented
# copy the source file so the original readme.txt is never modified
shutil.copy('readme.txt', fnamezxk)
# Read the CSV word list row by row, then segment the article (test.txt)
# by breaking the text before and after each dictionary word.
def duiwenzhangfenci(fname1, fname2):  # segment file fname1 using the word list in fname2
    df = pd.read_csv(fname2, header=0)       # read_csv already returns a DataFrame
    for indexs in df.index:
        char12k = df['word'].iloc[indexs]    # one dictionary word per row
        fuhaocongqianhoufenge(fname1, char12k)
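# duiwenzhangfenci assumes the word-list CSV has a header row with a column
# named 'word' and one dictionary word per line.  The helper below is only an
# illustrative sketch of that layout (the file name and sample words are made
# up, not data shipped with this project); it is not called anywhere.
def xie_ciku_demo(fname='dicc_demo.csv'):
    f = open(fname, 'w')
    f.write('word' + '\n')           # header row expected by pd.read_csv(header=0)
    for w in ['apple', 'banana']:    # hypothetical dictionary entries
        f.write(w + '\n')
    f.close()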
def fuhaocongqianhoufenge(fname, char):  # cut before and after the delimiter: char ends up on its own line
    f = open(fname, 'r')
    text = f.read()
    f.close()
    line1 = text.split(char)
    f = open(fname, 'w+')
    for i in line1:
        if len(i) > 1:
            f.write(i.strip() + '\n' + char + '\n')
    f.close()
############
duiwenzhangfenci (fnamezxk, fnamezx)
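# A small in-memory sketch of the cut-before-and-after rule used above, so its
# effect is visible without touching any file.  The sample sentence and word
# are assumptions made up for illustration; the function is never called here.
def fenci_demo():
    text = 'one apple two apple three'    # hypothetical input text
    char = 'apple'                        # hypothetical dictionary word
    out = ''
    for i in text.split(char):
        if len(i) > 1:                    # same length guard as fuhaocongqianhoufenge
            out += i.strip() + '\n' + char + '\n'
    print out   # note: the rule also writes the word once after the final piece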