- 本节内容的数据见电脑F:\python数据\Python海量数据(精缩版) 或 百度网盘“我的数据文件/Python海量数据”
一、先了解下QQ群的数据
1.QQ目录下的文件
(QQ目录下的文件是按照QQ群号有序排列的,所以根据QQ群号查找群里的QQ很方便;但要根据一个人的QQ号查找他所在的群就麻烦了)
2.QQqun目录下的文件
(因为数据是有序的,两个文件归并的时候就不用排序)
以下是对QQ群数据的合并、检索、制作索引、快速查找
一、QQ群数据的合并
1.把这么多文件里面的数据合并到一个大文件里去
import os
# Merge the 110 raw group files (QunList1.txt .. QunList110.txt) byte-for-byte
# into one big file.
filedir = "/mnt/hgfs/E/QQ项目/QUN/qqQun"
# Resulting list: [".../QunList1.txt", ".../QunList2.txt", ...]
filelist = [filedir + "/QunList" + str(i) + ".txt" for i in range(1, 111)]

# `with` closes both handles even when a read or write fails half-way.
with open("/mnt/hgfs/E/QQ项目/QUN/qqQun/allQQqun.txt", "wb") as allfile:
    for filepath in filelist:
        with open(filepath, "rb") as tmpfile:
            # Stream line by line instead of readlines(), so a very large
            # input file is never held in memory as a whole.
            for line in tmpfile:
                allfile.write(line)
2.把这么多文件里面的数据进行处理然后再合并到一个大文件里去
# Extract three columns from every raw group file and merge the trimmed rows
# into a single "lite" file.
filedir = "/mnt/hgfs/E/newQQ/QQqun"
filelist = [filedir + "/QunList" + str(i) + ".txt" for i in range(1, 111)]

# `with` closes both handles even when a read or write fails half-way.
with open("/mnt/hgfs/E/newQQ/QQqunalllite.txt", "wb") as allqunfile:
    for filepath in filelist:
        with open(filepath, "rb") as tmpfile:
            # Stream the file instead of readlines() to keep memory flat.
            for line in tmpfile:
                line = line.decode("utf-8", "ignore")  # drop undecodable bytes
                linelist = line.split("\t")
                # A row without exactly 7 tab-separated columns is incomplete
                # (garbage data), so it is skipped.
                if len(linelist) != 7:
                    continue
                # Keep columns 1, 4 and 6 and strip the double quotes.
                Qunid = linelist[1].replace("\"", "")
                Qunname = linelist[4].replace("\"", "")
                Quntitle = linelist[6].replace("\"", "")
                wline = Qunid + "\t" + Qunname + "\t" + Quntitle
                # The last column normally carries the line terminator; add
                # one when it is missing (e.g. final line of a file) so two
                # records never fuse into one output row.
                if not wline.endswith("\n"):
                    wline += "\n"
                allqunfile.write(wline.encode("utf-8"))
二、补充:jieba库如何做到模糊搜索
1.jieba库
import jieba
# Segment a sentence with jieba and print the words with two different
# separators.
mystr = "我今天与朋友聊天聊到了美女与野兽"
# jieba.cut() returns a one-shot generator: after the first join() it is
# exhausted and a second join() would print an empty string.  Materialise the
# words in a list so they can be reused.
mycut = list(jieba.cut(mystr))
print(",".join(mycut))   # 我,今天,与,朋友,聊天,聊到,了,美女,与,野兽
print("--".join(mycut))  # 我--今天--与--朋友--聊天--聊到--了--美女--与--野兽
2.分词搜索
import jieba
import jieba.posseg
# Demo: score how well `laststr` matches the query `mystr` as the fraction of
# the query's segments that occur somewhere in `laststr`.
mystr = "软件工程"
laststr = "软件工程1班"
# cut_all=True (full mode) lists every possible word: 软件, 软件工程, 工程
jiebastr = ",".join(jieba.cut(mystr, cut_all=True))
wordlist = jiebastr.split(",")  # ['软件', '软件工程', '工程']
print(wordlist)
length = len(wordlist)
# Count the segments that appear in the candidate text.
getlength = sum(1 for word in wordlist if laststr.find(word) != -1)
print(getlength / length)
三、信息搜索
我们根据刚刚合并的数据,从里面搜索“妈妈”,得到的那些群号就可以做母婴用品的推销
从里面搜索“软件工程”,就可以向那些群号投放IT培训方面的广告
1.常规的普通搜索
# Plain substring search: every query scans the merged file from the start
# and prints each line that contains the query text.
filepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"
# `with` guarantees the file is closed when the loop ends or an error occurs.
with open(filepath, "rb") as datafile:
    while True:
        try:
            searchstr = input("要查询的数据")
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C is the way out of the otherwise endless loop.
            break
        datafile.seek(0, 0)  # rewind for a fresh scan
        for line in datafile:
            line = line.decode("utf-8")
            if line.find(searchstr) != -1:
                print(line, end="")
2.模糊搜索
import jieba
import jieba.posseg
def findata(mystr, laststr, tokenizer=None):
    """Return the fraction of the query's segments that occur in ``laststr``.

    Args:
        mystr: the query text; it is split into words by ``tokenizer``.
        laststr: the candidate text each word is looked up in.
        tokenizer: optional callable taking the query string and returning an
            iterable of words.  Defaults to ``jieba.cut(..., cut_all=True)``.

    Returns:
        A float between 0.0 and 1.0: matched words / total words.  Returns
        0.0 when the query yields no usable word, so an empty query matches
        nothing instead of everything.
    """
    if tokenizer is None:
        words = jieba.cut(mystr, cut_all=True)
    else:
        words = tokenizer(mystr)
    # Use the words directly (no join/split round-trip, which broke on words
    # containing ",") and drop empty or whitespace-only words: the empty
    # string is "found" in any text and would inflate the score.
    wordlist = [word for word in words if word.strip()]
    if not wordlist:
        return 0.0
    getlength = sum(1 for word in wordlist if word in laststr)
    return getlength / len(wordlist)
# Fuzzy search: load the whole merged file into memory once, then score every
# line against each query with findata().
filepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"
# Reading everything at once costs several GB of RAM but makes each query
# fast.  The file is no longer needed after that, so close it right away.
with open(filepath, "rb") as datafile:
    datalist = datafile.readlines()
print("load mem")
while True:
    try:
        searchstr = input("要查询的数据")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C is the way out of the otherwise endless loop.
        break
    for line in datalist:
        line = line.decode("utf-8")
        # A match ratio of at least 0.3 is enough to print the line, which
        # returns more results than the exact substring search.
        # NOTE: findata() segments the query again for every line.
        if findata(searchstr, line) >= 0.3:
            print(line, end="")
五、制作索引
我们根据刚刚合并的数据,制作索引
1.第一种土豪的做法— 一次性读入
# Build the line-offset index the memory-hungry way: read every line of the
# data file into a list first.  The index holds, for each line, the byte
# offset at which the line starts, written as a 15-character decimal field.
filepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"
savefilepath = "/mnt/hgfs/E/newQQ/QQqunallliteindex.txt"

with open(filepath, "rb") as Qunfile:
    Qunlist = Qunfile.readlines()  # whole file in memory
    print(len(Qunlist))  # 86907937 on the author's data set

    # Start offset of a line = total length of all lines before it.
    lengthlist = []
    running = 0
    for line in Qunlist:
        lengthlist.append(running)
        running += len(line)
    del Qunlist  # release the big list before writing the index
    print("list")
    print("sort")

    with open(savefilepath, "wb") as savefile:
        savefile.writelines(
            format(data, "15d").encode("utf-8") for data in lengthlist
        )
2.比较节约内存的做法— 一行一行的读入
# Build the line-offset index while reading the data file one line at a time;
# only the list of offsets is kept in memory.  Each offset is written as a
# 15-character decimal field.
filepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"
savefilepath = "/mnt/hgfs/E/newQQ/QQqunallliteindex_disk1.txt"

with open(filepath, "rb") as Qunfile:
    lengthlist = []
    offset = 0
    # readline() returns b"" at end of file, which stops the iteration.
    for line in iter(Qunfile.readline, b""):
        lengthlist.append(offset)  # where this line starts
        offset += len(line)
    print("list")
    print("sort")

    with open(savefilepath, "wb") as savefile:
        for data in lengthlist:
            savefile.write(format(data, "15d").encode("utf-8"))
3.最有利于内存的方法-----边读边写
# Build the line-offset index with constant memory: read one line, write one
# index entry.  Each entry is the byte offset of a line start, formatted as a
# 15-character decimal field.
filepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"
savefilepath = "/mnt/hgfs/E/newQQ/QQqunallliteindex_disk2.txt"
# `with` closes both files even when reading or writing fails.
with open(filepath, "rb") as Qunfile:
    with open(savefilepath, "wb") as savefile:
        pos = 0
        for line in Qunfile:
            # Write the offset BEFORE advancing, so the index holds exactly
            # one entry per line and no trailing end-of-file entry.  This
            # matches the index written by the two in-memory variants, which
            # delete their last element.
            savefile.write(format(pos, "15d").encode("utf-8"))
            pos += len(line)
六、根据索引快速查找
1.随机访问
# Random access by line number: look up the line's byte offset in the
# fixed-width index (15 characters per entry), then seek straight to it in
# the data file.
csdnfilepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"
csdnindexfilepath = "/mnt/hgfs/E/newQQ/QQqunallliteindex.txt"
# `with` closes both files when the loop ends or an error occurs.
with open(csdnfilepath, "rb") as csdnfile:
    with open(csdnindexfilepath, "rb") as csdnindexfile:
        while True:
            try:
                text = input("input lines")
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C is the way out of the otherwise endless loop.
                break
            # int() instead of eval(): user input must never be executed.
            try:
                linenum = int(text)
            except ValueError:
                print("please enter a line number (integer >= 1)")
                continue
            if linenum < 1:
                # Line numbers are 1-based; 0 or less would seek before the
                # start of the index file.
                print("please enter a line number (integer >= 1)")
                continue
            csdnindexfile.seek(15 * (linenum - 1), 0)  # entry of that line
            entry = csdnindexfile.read(15)  # one 15-character entry
            if len(entry) < 15:
                print("line number out of range")
                continue
            lineval = int(entry)  # int() accepts the space-padded digits
            csdnfile.seek(lineval, 0)  # jump to the start of the line
            line = csdnfile.readline()
            line = line.decode("utf-8", "ignore")
            print(line)
2.二分查找
def search2(searchstr, datafile=None, indexfile=None, total=None):
    """Binary-search the indexed data file for a line whose first column is ``searchstr``.

    The index file is a sequence of 15-character decimal fields; field number
    k (1-based) holds the byte offset at which line k of the data file
    starts.  Each probed line is split on tabs and its first column is
    compared with ``searchstr`` as a string.

    Args:
        searchstr: the key to look for.
        datafile: data file opened in binary mode; defaults to the
            module-level ``csdnfile``.
        indexfile: index file opened in binary mode; defaults to the
            module-level ``csdnindexfile``.
        total: number of lines to search; defaults to the number of entries
            in the index file.

    Returns:
        The 1-based line number of a matching line, or -1 when nothing matches.
    """
    if datafile is None:
        datafile = csdnfile
    if indexfile is None:
        indexfile = csdnindexfile
    width = 15  # characters per index entry
    if total is None:
        indexfile.seek(0, 2)
        total = indexfile.tell() // width

    # Line numbers are 1-based because entry k lives at byte width * (k - 1).
    # Starting at 0 would seek to a negative position, and stopping at
    # total - 1 would never probe the last line.
    low = 1
    high = total
    times = 0
    # NOTE(review): keys are compared as strings, so the first column must be
    # in string (lexicographic) order - confirm how the data file is sorted.
    while low <= high:
        times += 1
        print("times", times)
        mid = (low + high) // 2
        indexfile.seek(width * (mid - 1), 0)
        # int() parses the space-padded digits without executing file content.
        lineval = int(indexfile.read(width))
        datafile.seek(lineval, 0)
        line = datafile.readline().decode("utf-8", "ignore")
        if not line:
            # The entry points at end of file: there is no such line.
            high = mid - 1
            continue
        middata = line.split("\t")[0]
        if searchstr < middata:
            high = mid - 1  # discard the upper half
        elif searchstr > middata:
            low = mid + 1  # discard the lower half
        else:
            print("find", line, mid)
            return mid
    print("not find")
    return -1
# Interactive driver: binary-search the merged group file for each key typed
# by the user.  search2() reads the module-level csdnfile / csdnindexfile.
csdnfilepath = "/mnt/hgfs/E/newQQ/QQqunalllite.txt"
csdnindexfilepath = "/mnt/hgfs/E/newQQ/QQqunallliteindex.txt"
# `with` closes both files when the loop ends or an error occurs.
with open(csdnfilepath, "rb") as csdnfile:
    with open(csdnindexfilepath, "rb") as csdnindexfile:
        while True:
            try:
                searchstr = input("input searchstr")  # the key to look for
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C is the way out of the otherwise endless loop.
                break
            search2(searchstr)
以下是对QQ文件的合并、制作索引、根据索引迅速查找
一、合并QQ文件
右边文件中依次是(编号、QQ号、QQ名、年龄、性别、 ? 、QQ群号)
我们依次拿到(QQ号:21605735、QQ名:晴子、年龄:22,QQ群号:100100)
# Extract four columns from each of the 110 Group*.txt member files and merge
# the trimmed rows into QQall.txt.
filedir = "/mnt/hgfs/E/newQQ/QQ"
# Generate the 110 input file names.
filelist = [filedir + "/Group" + str(i) + ".txt" for i in range(1, 111)]

# `with` closes both handles even when a read or write fails half-way.
with open("/mnt/hgfs/E/newQQ/QQall.txt", "wb") as allqunfile:
    for filepath in filelist:
        print(filepath)  # progress
        with open(filepath, "rb") as tmpfile:
            # Stream the file instead of readlines() to keep memory flat.
            for line in tmpfile:
                # "ignore" drops undecodable bytes instead of aborting the
                # whole merge, as the group-file merge does.
                line = line.decode("utf-8", "ignore")
                linelist = line.split("\t")
                # Column 6 is read below; a shorter row is incomplete and
                # would raise IndexError, so skip it.
                if len(linelist) < 7:
                    continue
                # Keep columns 1, 2, 3 and 6 and strip the double quotes.
                QQ = linelist[1].replace("\"", "")
                QQname = linelist[2].replace("\"", "")
                QQage = linelist[3].replace("\"", "")
                QQqun = linelist[6].replace("\"", "")
                wline = QQ + "\t" + QQname + "\t" + QQage + "\t" + QQqun
                # Column 6 normally carries the line terminator; add one when
                # it is missing so two records never fuse into one row.
                if not wline.endswith("\n"):
                    wline += "\n"
                allqunfile.write(wline.encode("utf-8"))
得到的文件是按照QQ群号有序排列的,所以它比较适合根据QQ群号找这个群里的QQ号
二、QQ数据的索引
QQall.txt文件是按照QQ群号有序排列的,所以它比较适合根据QQ群号找这个群里的QQ号
# Build the line-offset index of QQall.txt with constant memory: read one
# line, write one index entry.  Each entry is the byte offset of a line
# start, formatted as a 15-character decimal field.
filepath = "/mnt/hgfs/E/newQQ/QQall.txt"
savefilepath = "/mnt/hgfs/E/newQQ/QQallindex.txt"
# `with` closes both files even when reading or writing fails.
with open(filepath, "rb") as Qunfile:
    with open(savefilepath, "wb") as savefile:
        pos = 0
        for line in Qunfile:
            # Write the offset BEFORE advancing, so the index holds exactly
            # one entry per line and no trailing end-of-file entry that
            # would point at an empty line.
            savefile.write(format(pos, "15d").encode("utf-8"))
            pos += len(line)
三、根据索引随机访问QQ群
# Random access by line number: look up the line's byte offset in the
# fixed-width index (15 characters per entry), then seek straight to it in
# the data file.
csdnfilepath = "/mnt/hgfs/E/newQQ/qun_data/qun_name_qq.txt"
csdnindexfilepath = "/mnt/hgfs/E/newQQ/qun_data/qun_qq_index.txt"
# `with` closes both files when the loop ends or an error occurs.
with open(csdnfilepath, "rb") as csdnfile:
    with open(csdnindexfilepath, "rb") as csdnindexfile:
        while True:
            try:
                text = input("input lines")
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C is the way out of the otherwise endless loop.
                break
            # int() instead of eval(): user input must never be executed.
            try:
                linenum = int(text)
            except ValueError:
                print("please enter a line number (integer >= 1)")
                continue
            if linenum < 1:
                # Line numbers are 1-based; 0 or less would seek before the
                # start of the index file.
                print("please enter a line number (integer >= 1)")
                continue
            csdnindexfile.seek(15 * (linenum - 1), 0)  # entry of that line
            entry = csdnindexfile.read(15)  # one 15-character entry
            if len(entry) < 15:
                print("line number out of range")
                continue
            lineval = int(entry)  # int() accepts the space-padded digits
            csdnfile.seek(lineval, 0)  # jump to the start of the line
            line = csdnfile.readline()
            line = line.decode("utf-8", "ignore")
            print(line)
四、根据QQ群查找QQ
def search2(searchstr, datafile=None, indexfile=None, total=None):
    """Return every line of the data file whose third column equals ``searchstr``.

    The index file is a sequence of 15-character decimal fields; field number
    k (1-based) holds the byte offset at which line k of the data file
    starts.  Each line is split on ``" # "`` and its third column is parsed
    as an integer key.  A binary search locates one matching line, then the
    neighbouring lines with the same key are collected.

    Args:
        searchstr: the integer key to look for.
        datafile: data file opened in binary mode; defaults to the
            module-level ``csdnfile``.
        indexfile: index file opened in binary mode; defaults to the
            module-level ``csdnindexfile``.
        total: number of lines to search; defaults to the number of entries
            in the index file.

    Returns:
        A list of matching lines (the line found by the binary search first,
        then the matches before it walking upwards, then the matches after
        it), or -1 when nothing matches.
    """
    if datafile is None:
        datafile = csdnfile
    if indexfile is None:
        indexfile = csdnindexfile
    width = 15  # characters per index entry
    if total is None:
        indexfile.seek(0, 2)
        total = indexfile.tell() // width

    def read_record(line_no):
        """Return (line, key) for the 1-based line number, or (None, None) at end of file."""
        indexfile.seek(width * (line_no - 1), 0)
        # int() parses the space-padded digits without executing file content.
        datafile.seek(int(indexfile.read(width)), 0)
        text = datafile.readline().decode("utf-8", "ignore")
        if not text:
            return None, None
        return text, int(text.split(" # ")[2])

    # Line numbers are 1-based because entry k lives at byte width * (k - 1).
    # Starting at 0 would seek to a negative position, and stopping at
    # total - 1 would never probe the last line.
    low = 1
    high = total
    while low <= high:
        mid = (low + high) // 2
        line, middata = read_record(mid)
        if line is None:
            # The entry points at end of file: there is no such line.
            high = mid - 1
            continue
        if searchstr < middata:
            high = mid - 1  # discard the upper half
        elif searchstr > middata:
            low = mid + 1  # discard the lower half
        else:
            QQlist = [line]  # start with the line the binary search hit
            # Walk upwards while the key still matches; lines below `low`
            # were already ruled out by the binary search.
            tmp_up = mid - 1
            while tmp_up >= low:
                upline, upmiddata = read_record(tmp_up)
                if upmiddata != searchstr:
                    break
                QQlist.append(upline)
                tmp_up -= 1
            # Walk downwards the same way, never beyond `high`.
            tmp_down = mid + 1
            while tmp_down <= high:
                downline, downmiddata = read_record(tmp_down)
                if downmiddata != searchstr:
                    break
                QQlist.append(downline)
                tmp_down += 1
            return QQlist
    print("not find")
    return -1
# Interactive driver: for each group number typed by the user, print every
# line search2() returns.  search2() reads the module-level csdnfile /
# csdnindexfile.
csdnfilepath = "/mnt/hgfs/E/newQQ/qun_data/qun_name_qq.txt"
csdnindexfilepath = "/mnt/hgfs/E/newQQ/qun_data/qun_qq_index.txt"
# `with` closes both files when the loop ends or an error occurs.
with open(csdnfilepath, "rb") as csdnfile:
    with open(csdnindexfilepath, "rb") as csdnindexfile:
        while True:
            try:
                text = input("input searchstr")
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C is the way out of the otherwise endless loop.
                break
            # int() instead of eval(): user input must never be executed.
            try:
                searchstr = int(text)
            except ValueError:
                print("please enter a numeric group id")
                continue
            QQlist = search2(searchstr)
            if QQlist == -1:
                # search2() returns -1 (and prints "not find") when nothing
                # matches; iterating that value would raise TypeError.
                continue
            for QQline in QQlist:
                print(QQline, end="")
五、根据QQ查找它所在的QQ群
那么如何根据QQ查找它所在的QQ群呢?
按理来讲我们要根据QQ查找它所在的QQ群需要一份按照QQ号排列的正序文件
下面的qq_name_qun.txt从QQ群号上看是正序的,但从QQ号上看是乱序的,所以我们做了一份倒排索引文件(index.txt是按照QQ号正序排列制作的索引;只要有一个正序的索引就可以了)
根据QQ查找它所在的QQ群
def search2(searchstr, datafile=None, indexfile=None, total=None):
    """Return every line of the data file whose first column equals ``searchstr``.

    The index file is a sequence of 15-character decimal fields; field number
    k (1-based) holds the byte offset of one line of the data file.  The
    search walks the index in entry order, so the data file itself may be in
    any order.  Each line is split on ``" # "`` and its first column is
    compared with ``searchstr`` as a string.

    Args:
        searchstr: the key to look for, as a string.
        datafile: data file opened in binary mode; defaults to the
            module-level ``csdnfile``.
        indexfile: index file opened in binary mode; defaults to the
            module-level ``csdnindexfile``.
        total: number of index entries to search; defaults to the number of
            entries in the index file.

    Returns:
        A list of matching lines (the line found by the binary search first,
        then the matches before it walking upwards, then the matches after
        it), or -1 when nothing matches.
    """
    if datafile is None:
        datafile = csdnfile
    if indexfile is None:
        indexfile = csdnindexfile
    width = 15  # characters per index entry
    if total is None:
        indexfile.seek(0, 2)
        total = indexfile.tell() // width

    def read_record(entry_no):
        """Return (line, key) for the 1-based index entry, or (None, None) at end of file."""
        indexfile.seek(width * (entry_no - 1), 0)
        # int() parses the space-padded digits without executing file content.
        datafile.seek(int(indexfile.read(width)), 0)
        text = datafile.readline().decode("utf-8", "ignore")
        if not text:
            return None, None
        return text, text.split(" # ")[0]

    # Entry numbers are 1-based because entry k lives at byte width * (k - 1).
    # Starting at 0 would seek to a negative position, and stopping at
    # total - 1 would never probe the last entry.
    low = 1
    high = total
    # NOTE(review): keys are compared as strings, so the index must list the
    # lines in string (lexicographic) order of the first column - confirm
    # how index.txt was sorted.
    while low <= high:
        mid = (low + high) // 2
        line, middata = read_record(mid)
        if line is None:
            # The entry points at end of file: there is no such line.
            high = mid - 1
            continue
        if searchstr < middata:
            high = mid - 1  # discard the upper half
        elif searchstr > middata:
            low = mid + 1  # discard the lower half
        else:
            QQlist = [line]  # start with the line the binary search hit
            # Walk upwards while the key still matches; entries below `low`
            # were already ruled out by the binary search.
            tmp_up = mid - 1
            while tmp_up >= low:
                upline, upmiddata = read_record(tmp_up)
                if upmiddata != searchstr:
                    break
                QQlist.append(upline)
                tmp_up -= 1
            # Walk downwards the same way, never beyond `high`.
            tmp_down = mid + 1
            while tmp_down <= high:
                downline, downmiddata = read_record(tmp_down)
                if downmiddata != searchstr:
                    break
                QQlist.append(downline)
                tmp_down += 1
            return QQlist
    print("not find")
    return -1
# Interactive driver: for each QQ number typed by the user, print every line
# search2() returns.  search2() reads the module-level csdnfile /
# csdnindexfile.
csdnfilepath = "/mnt/hgfs/E/newQQ/qq_data/qq_name_qun.txt"
csdnindexfilepath = "/mnt/hgfs/E/newQQ/qq_data/index.txt"
# `with` closes both files when the loop ends or an error occurs.
with open(csdnfilepath, "rb") as csdnfile:
    with open(csdnindexfilepath, "rb") as csdnindexfile:
        while True:
            try:
                searchstr = input("input searchstr")
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C is the way out of the otherwise endless loop.
                break
            QQqunlist = search2(searchstr)
            if QQqunlist == -1:
                # search2() returns -1 (and prints "not find") when nothing
                # matches; iterating that value would raise TypeError.
                continue
            for QQqun in QQqunlist:
                print(QQqun)