# -*- coding: utf-8 -*-
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding() reachable again so
# that implicit str <-> unicode conversions use GBK instead of ASCII.
# Neither `reload` (as a builtin) nor `sys.setdefaultencoding` exists on
# Python 3, so the hack is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('gbk')
def replacetxt(fname, char, char1):
    """Rewrite a file in place, replacing each ``char`` with ``char1`` on its own line.

    Every occurrence of ``char`` becomes: line break, ``char1``, line break,
    empty line. Text that is not a separator is written back unchanged.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator to search for; ``str.split`` raises ``ValueError``
            when it is empty (the file is left untouched in that case).
        char1: Text written in place of each separator.
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    replacement = chr(10) + char1 + chr(10) + chr(10)
    # join() puts the replacement only *between* pieces, so no extra
    # ``char1`` is invented after the last piece of text.
    with open(fname, 'w') as dst:
        dst.write(replacement.join(pieces))
def cuttxt(fname, char):
    """Rewrite a file in place, putting each ``char`` on its own padded line.

    The file content is split on ``char``. Pieces whose ``len()`` is 0 or 1
    are skipped. Every other piece is echoed (first 7 characters) to stdout
    and written followed by a line holding ``" " + char + " "``.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator to split on (must be non-empty).
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    with open(fname, 'w') as dst:
        for piece in pieces:
            if len(piece) <= 1:
                continue
            # NOTE(review): the original layout was lost, so the `else` is
            # attached to the nearest `if`. With a one-character literal the
            # comparison below is always true for pieces longer than one
            # character, so the `else` branch cannot run -- confirm the
            # intended literal (possibly several spaces).
            if piece[0:7] != " ":
                print(piece[0:7])
                dst.write(piece + chr(10) + " " + char + " " + chr(10))
            else:
                dst.write(piece + chr(10))
def cuttxtline(fname, char):
    """Rewrite a file in place, putting each ``char`` on its own padded line.

    Same processing as ``cuttxt``: the content is split on ``char``, pieces
    whose ``len()`` is 0 or 1 are skipped, and every other piece is echoed
    (first 7 characters) to stdout and written followed by a line holding
    ``" " + char + " "``.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator to split on (must be non-empty).
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    with open(fname, 'w') as dst:
        for piece in pieces:
            if len(piece) <= 1:
                continue
            # NOTE(review): the original layout was lost, so the `else` is
            # attached to the nearest `if`. With a one-character literal the
            # comparison below is always true for pieces longer than one
            # character, so the `else` branch cannot run -- confirm the
            # intended literal (possibly several spaces).
            if piece[0:7] != " ":
                print(piece[0:7])
                dst.write(piece + chr(10) + " " + char + " " + chr(10))
            else:
                dst.write(piece + chr(10))
import shutil
import os
# Work on a copy: the processing calls in this script rewrite 'test.txt' in
# place, so 'readme.txt' itself is never modified.
shutil.copy('readme.txt', 'test.txt')
def fuhaoconghoufenge(fname, char):
    """Rewrite a file in place, breaking the line right after every ``char``.

    The content is split on ``char``; each piece is stripped of surrounding
    whitespace and written as ``piece + char`` on its own line. The text
    after the last separator is written without a ``char``, because none
    followed it in the input.

    Pieces whose ``len()`` is 0 or 1 are skipped together with the separator
    that followed them.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator to cut after (must be non-empty).
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    last = len(pieces) - 1
    with open(fname, 'w') as dst:
        for idx, piece in enumerate(pieces):
            if len(piece) > 1:
                # Only pieces that were actually followed by a separator
                # get one back.
                suffix = char if idx < last else ''
                dst.write(piece.strip() + suffix + chr(10))
def fuhaocongqianhoufenge(fname, char):
    """Rewrite a file in place, isolating every ``char`` on its own line.

    The content is split on ``char``; each piece is stripped of surrounding
    whitespace and written on its own line, followed by a line containing
    only ``char``. The text after the last separator is written without a
    ``char`` line, so a file that does not contain ``char`` does not gain one.

    Pieces whose ``len()`` is 0 or 1 are skipped together with the separator
    that followed them.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator (character or word) to cut before and after;
            must be non-empty.
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    last = len(pieces) - 1
    with open(fname, 'w') as dst:
        for idx, piece in enumerate(pieces):
            if len(piece) <= 1:
                continue
            dst.write(piece.strip() + chr(10))
            # Only pieces that were actually followed by a separator get
            # one back.
            if idx < last:
                dst.write(char + chr(10))
def fuhaofengedel(fname, char):
    """Rewrite a file in place, dropping every ``char`` and breaking the line there.

    The content is split on ``char``; each piece is stripped of surrounding
    whitespace and written on its own line. The separator itself is
    discarded. Pieces whose ``len()`` is 0 or 1 are skipped.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator to remove (must be non-empty).
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    with open(fname, 'w') as dst:
        for piece in pieces:
            if len(piece) > 1:
                dst.write(piece.strip() + chr(10))
def hebing(fname, char1, char2):
    """Rewrite a file in place, re-joining ``char1`` and ``char2`` split across lines.

    Looks for ``char1``, a line break, then ``char2``, and replaces each
    occurrence with the merged token ``char1 + char2`` on its own line. The
    text between occurrences is stripped of surrounding whitespace and
    written on its own line. The text after the last occurrence is written
    without a merged token, so a file without any occurrence does not gain
    one.

    Pieces whose ``len()`` is 0 or 1 are skipped together with the merged
    token that followed them.

    Args:
        fname: Path of the text file to rewrite.
        char1: First half of the word, expected at the end of a line.
        char2: Second half of the word, expected at the start of the next line.
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char1 + chr(10) + char2)

    last = len(pieces) - 1
    with open(fname, 'w') as dst:
        for idx, piece in enumerate(pieces):
            if len(piece) <= 1:
                continue
            dst.write(piece.strip() + chr(10))
            # Only pieces that were actually followed by a split word get
            # the merged word written after them.
            if idx < last:
                dst.write(char1 + char2 + chr(10))
def lianggecidoubufenkai(fname, char1, char2):
    """Isolate ``char2`` in the file, then re-attach a preceding ``char1`` to it.

    Args:
        fname: Path of the text file to rewrite in place.
        char1: Prefix that must stay attached to ``char2`` when present.
        char2: Word to put on its own line.
    """
    # Step 1: put every ``char2`` on its own line.
    fuhaocongqianhoufenge(fname, char2)
    # Step 2: where a line ended with ``char1`` right before ``char2``,
    # merge the two back into a single token.
    hebing(fname, char1, char2)
# Idea noted by the author (not implemented): convert the text to unicode
# first so that Chinese and ASCII characters each count as one character,
# then process it character by character.

# Step 1: punctuation and spaces are discarded; a line break is left in
# their place.
for _mark in ('、', '(', ')', '(', ')', ':', '-', '“', '”', ' '):
    fuhaofengedel('test.txt', _mark)

# Step 2: punctuation and words that are isolated on a line of their own.
for _word in (
    ',', ',', '。',
    '是',  # unambiguous, or only rarely ambiguous
    '的', '这些', '然后', '利用', '一个', '含有', '例如', '用来', '它',
    '描述', '隐含', '参数', '基本', '参数', '确定', '进一步', '来作',
    '认为', '系统', '作为', '一种', '观测', '观察', '建模', '世纪',
    '年代', '得到', '随机过程', '数学模型', '奠定',
):
    fuhaocongqianhoufenge('test.txt', _word)

# Step 3: re-join words whose two halves ended up on consecutive lines.
for _head, _tail in (('定', '义'), ('应', '用')):
    hebing('test.txt', _head, _tail)

# Step 4: keep both the long term and the same term with the prefix '隐'
# unsplit (e.g. both 隐马尔可夫过程 and 马尔可夫过程 stay whole).
for _term in ('马尔可夫过程', '马尔可夫链', '马尔可夫模型'):
    lianggecidoubufenkai('test.txt', '隐', _term)
# Do not use this casually, e.g. lianggecidoubufenkai('test.txt','未知','数'):
# it would split on every '数' character. The second argument should be a
# long technical term, never a common single character.

# Must come last: separates the lines from each other.
fuhaoconghoufenge('test.txt', chr(10))

# Design notes:
# - One could run jieba segmentation first and then post-process the cases
#   where its result is unsatisfactory.
# - List all known words and never split inside them; where none of these
#   words is found, split at the following character.
# - Put differently: split everything into single characters, then
#   reassemble them into words. Technical terms that are missing from the
#   lexicon cannot be reassembled this way.
# - So split only when confident; when unsure, do not split.
# - Once every word with a clear meaning has been fixed, the uncertain
#   words become determined as well.
# - Finding many certain factors eliminates the uncertain ones.
import sys
# NOTE(review): from the `import sys` just above to the end of the file, the
# definitions and the processing script from the first half of this file are
# repeated, so the functions are redefined and the pipeline runs a second
# time. Confirm whether this second half can be removed.
#
# Python 2 only: reload(sys) makes sys.setdefaultencoding() reachable again so
# that implicit str <-> unicode conversions use GBK instead of ASCII.
# Neither `reload` (as a builtin) nor `sys.setdefaultencoding` exists on
# Python 3, so the hack is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('gbk')
def replacetxt(fname, char, char1):
    """Rewrite a file in place, replacing each ``char`` with ``char1`` on its own line.

    Every occurrence of ``char`` becomes: line break, ``char1``, line break,
    empty line. Text that is not a separator is written back unchanged.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator to search for; ``str.split`` raises ``ValueError``
            when it is empty (the file is left untouched in that case).
        char1: Text written in place of each separator.
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    replacement = chr(10) + char1 + chr(10) + chr(10)
    # join() puts the replacement only *between* pieces, so no extra
    # ``char1`` is invented after the last piece of text.
    with open(fname, 'w') as dst:
        dst.write(replacement.join(pieces))
def cuttxt(fname, char):
    """Rewrite a file in place, putting each ``char`` on its own padded line.

    The file content is split on ``char``. Pieces whose ``len()`` is 0 or 1
    are skipped. Every other piece is echoed (first 7 characters) to stdout
    and written followed by a line holding ``" " + char + " "``.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator to split on (must be non-empty).
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    with open(fname, 'w') as dst:
        for piece in pieces:
            if len(piece) <= 1:
                continue
            # NOTE(review): the original layout was lost, so the `else` is
            # attached to the nearest `if`. With a one-character literal the
            # comparison below is always true for pieces longer than one
            # character, so the `else` branch cannot run -- confirm the
            # intended literal (possibly several spaces).
            if piece[0:7] != " ":
                print(piece[0:7])
                dst.write(piece + chr(10) + " " + char + " " + chr(10))
            else:
                dst.write(piece + chr(10))
def cuttxtline(fname, char):
    """Rewrite a file in place, putting each ``char`` on its own padded line.

    Same processing as ``cuttxt``: the content is split on ``char``, pieces
    whose ``len()`` is 0 or 1 are skipped, and every other piece is echoed
    (first 7 characters) to stdout and written followed by a line holding
    ``" " + char + " "``.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator to split on (must be non-empty).
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    with open(fname, 'w') as dst:
        for piece in pieces:
            if len(piece) <= 1:
                continue
            # NOTE(review): the original layout was lost, so the `else` is
            # attached to the nearest `if`. With a one-character literal the
            # comparison below is always true for pieces longer than one
            # character, so the `else` branch cannot run -- confirm the
            # intended literal (possibly several spaces).
            if piece[0:7] != " ":
                print(piece[0:7])
                dst.write(piece + chr(10) + " " + char + " " + chr(10))
            else:
                dst.write(piece + chr(10))
import shutil
import os
# Work on a copy: the processing calls in this script rewrite 'test.txt' in
# place, so 'readme.txt' itself is never modified.
shutil.copy('readme.txt', 'test.txt')
def fuhaoconghoufenge(fname, char):
    """Rewrite a file in place, breaking the line right after every ``char``.

    The content is split on ``char``; each piece is stripped of surrounding
    whitespace and written as ``piece + char`` on its own line. The text
    after the last separator is written without a ``char``, because none
    followed it in the input.

    Pieces whose ``len()`` is 0 or 1 are skipped together with the separator
    that followed them.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator to cut after (must be non-empty).
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    last = len(pieces) - 1
    with open(fname, 'w') as dst:
        for idx, piece in enumerate(pieces):
            if len(piece) > 1:
                # Only pieces that were actually followed by a separator
                # get one back.
                suffix = char if idx < last else ''
                dst.write(piece.strip() + suffix + chr(10))
def fuhaocongqianhoufenge(fname, char):
    """Rewrite a file in place, isolating every ``char`` on its own line.

    The content is split on ``char``; each piece is stripped of surrounding
    whitespace and written on its own line, followed by a line containing
    only ``char``. The text after the last separator is written without a
    ``char`` line, so a file that does not contain ``char`` does not gain one.

    Pieces whose ``len()`` is 0 or 1 are skipped together with the separator
    that followed them.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator (character or word) to cut before and after;
            must be non-empty.
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    last = len(pieces) - 1
    with open(fname, 'w') as dst:
        for idx, piece in enumerate(pieces):
            if len(piece) <= 1:
                continue
            dst.write(piece.strip() + chr(10))
            # Only pieces that were actually followed by a separator get
            # one back.
            if idx < last:
                dst.write(char + chr(10))
def fuhaofengedel(fname, char):
    """Rewrite a file in place, dropping every ``char`` and breaking the line there.

    The content is split on ``char``; each piece is stripped of surrounding
    whitespace and written on its own line. The separator itself is
    discarded. Pieces whose ``len()`` is 0 or 1 are skipped.

    Args:
        fname: Path of the text file to rewrite.
        char: Separator to remove (must be non-empty).
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char)

    with open(fname, 'w') as dst:
        for piece in pieces:
            if len(piece) > 1:
                dst.write(piece.strip() + chr(10))
def hebing(fname, char1, char2):
    """Rewrite a file in place, re-joining ``char1`` and ``char2`` split across lines.

    Looks for ``char1``, a line break, then ``char2``, and replaces each
    occurrence with the merged token ``char1 + char2`` on its own line. The
    text between occurrences is stripped of surrounding whitespace and
    written on its own line. The text after the last occurrence is written
    without a merged token, so a file without any occurrence does not gain
    one.

    Pieces whose ``len()`` is 0 or 1 are skipped together with the merged
    token that followed them.

    Args:
        fname: Path of the text file to rewrite.
        char1: First half of the word, expected at the end of a line.
        char2: Second half of the word, expected at the start of the next line.
    """
    # Read everything first: opening the same path for writing truncates it.
    with open(fname, 'r') as src:
        pieces = src.read().split(char1 + chr(10) + char2)

    last = len(pieces) - 1
    with open(fname, 'w') as dst:
        for idx, piece in enumerate(pieces):
            if len(piece) <= 1:
                continue
            dst.write(piece.strip() + chr(10))
            # Only pieces that were actually followed by a split word get
            # the merged word written after them.
            if idx < last:
                dst.write(char1 + char2 + chr(10))
def lianggecidoubufenkai(fname, char1, char2):
    """Isolate ``char2`` in the file, then re-attach a preceding ``char1`` to it.

    Args:
        fname: Path of the text file to rewrite in place.
        char1: Prefix that must stay attached to ``char2`` when present.
        char2: Word to put on its own line.
    """
    # Step 1: put every ``char2`` on its own line.
    fuhaocongqianhoufenge(fname, char2)
    # Step 2: where a line ended with ``char1`` right before ``char2``,
    # merge the two back into a single token.
    hebing(fname, char1, char2)
# Idea noted by the author (not implemented): convert the text to unicode
# first so that Chinese and ASCII characters each count as one character,
# then process it character by character.

# Step 1: punctuation and spaces are discarded; a line break is left in
# their place.
for _mark in ('、', '(', ')', '(', ')', ':', '-', '“', '”', ' '):
    fuhaofengedel('test.txt', _mark)

# Step 2: punctuation and words that are isolated on a line of their own.
for _word in (
    ',', ',', '。',
    '是',  # unambiguous, or only rarely ambiguous
    '的', '这些', '然后', '利用', '一个', '含有', '例如', '用来', '它',
    '描述', '隐含', '参数', '基本', '参数', '确定', '进一步', '来作',
    '认为', '系统', '作为', '一种', '观测', '观察', '建模', '世纪',
    '年代', '得到', '随机过程', '数学模型', '奠定',
):
    fuhaocongqianhoufenge('test.txt', _word)

# Step 3: re-join words whose two halves ended up on consecutive lines.
for _head, _tail in (('定', '义'), ('应', '用')):
    hebing('test.txt', _head, _tail)

# Step 4: keep both the long term and the same term with the prefix '隐'
# unsplit (e.g. both 隐马尔可夫过程 and 马尔可夫过程 stay whole).
for _term in ('马尔可夫过程', '马尔可夫链', '马尔可夫模型'):
    lianggecidoubufenkai('test.txt', '隐', _term)
# Do not use this casually, e.g. lianggecidoubufenkai('test.txt','未知','数'):
# it would split on every '数' character. The second argument should be a
# long technical term, never a common single character.

# Must come last: separates the lines from each other.
fuhaoconghoufenge('test.txt', chr(10))

# Design notes:
# - One could run jieba segmentation first and then post-process the cases
#   where its result is unsatisfactory.
# - List all known words and never split inside them; where none of these
#   words is found, split at the following character.
# - Put differently: split everything into single characters, then
#   reassemble them into words. Technical terms that are missing from the
#   lexicon cannot be reassembled this way.
# - So split only when confident; when unsure, do not split.
# - Once every word with a clear meaning has been fixed, the uncertain
#   words become determined as well.
# - Finding many certain factors eliminates the uncertain ones.