Scraping script
The Selenium script below pages through the Chongqing government's public-mail list (cq.gov.cn) and dumps each page's table text to an Excel file.
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 30 11:09:30 2017
Page 1454 raises an error
@author: toby
"""
import time
import random
import pandas as pd
from selenium import webdriver

browser = webdriver.Firefox()
url = "http://www.cq.gov.cn/publicmail/citizen/ReleaseMailListDistrict.aspx"
browser.get(url)

# Grab the visible text of the mail table and save the first page.
# (find_element_by_id is the Selenium 3 API; Selenium 4 replaces it
# with find_element(By.ID, ...).)
elem = browser.find_element_by_id("dgrdMail")
str_text = elem.text
list_text = str_text.split("\n")
data = pd.DataFrame(list_text)
data.to_excel("7076.xlsx")

x = 10047
while x < 21960:
    # Click through to the next page
    linkElem = browser.find_element_by_id("btnNext")
    linkElem.click()
    elem = browser.find_element_by_id("dgrdMail")
    str_text = elem.text
    list_text = str_text.split("\n")
    data = pd.DataFrame(list_text)
    data.to_excel(str(x) + ".xlsx")
    x += 1
    # Random pause between pages to go easy on the server
    time.sleep(random.randint(1, 4))
    # Every 100 pages, take a five-minute break
    if x % 100 == 0:
        time.sleep(300)
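The loop above leaves one workbook per page. Below is a minimal follow-up sketch (not part of the original script) for stitching those exports into a single sheet with pandas; the glob pattern and the output name all_pages.xlsx are assumptions.

# Hypothetical merge step: combine the per-page exports
# (7076.xlsx, 10047.xlsx, ...) into one workbook.
import glob
import pandas as pd

frames = []
for path in sorted(glob.glob("*.xlsx")):
    if path == "all_pages.xlsx":
        continue  # skip the merged output if the script is re-run
    # Each per-page export holds a single unnamed column of row text
    frames.append(pd.read_excel(path, index_col=0))

merged = pd.concat(frames, ignore_index=True)
merged.to_excel("all_pages.xlsx")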
Visualization script
This script segments the collected text with jieba, filters stop words, and renders the term frequencies as a word cloud.
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 27 15:30:27 2017
@author: toby
"""
import copy  # only used by the commented-out Similarity sketch below
import jieba
import pandas
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Path to a Chinese font so the word cloud can render CJK glyphs
font = r'C:\Windows\Fonts\simfang.ttf'
filename = "文本内容.txt"
# Load the stop-word list: whitespace-separated words, one or more per line
def List_none_sense():
    list_noneSense_words = []
    with open("nonesense_words.txt", 'r') as file_noneSense_words:
        for line in file_noneSense_words:
            for word in line.split():
                list_noneSense_words.append(word)
    return list_noneSense_words
# Check whether a word is in the stop-word list
def none_sense_words_Judeg(word):
    return word in list_noneSense_words

# Filter out stop words, pure digits, and single-character tokens
def filter_none_sense_words(list1):
    list2 = []
    for i in list1:
        if not none_sense_words_Judeg(i[0]) and not i[0].isdigit() and len(i[0]) > 1:
            list2.append(i)
    return list2
# Tokenise the file and count term frequencies
def fenci(filename):
    # Read the whole file as one lower-cased string
    with open(filename, encoding='utf-8') as f:
        file_list = f.read().lower()
    # Segment with jieba (accurate mode)
    seg_list = list(jieba.cut(file_list, cut_all=False))
    # Build a word -> count dictionary
    tf = {}
    for seg in seg_list:
        seg = ''.join(seg.split())
        if seg != '' and seg != "\n" and seg != "\n\n":
            if seg in tf:
                tf[seg] += 1
            else:
                tf[seg] = 1
    return sorted(tf.items(), key=lambda item: item[1], reverse=True)
'''
f = open("result.txt", "w+")
for item in tf:
    f.write(item + " " + str(tf[item]) + "\n")
f.close()
'''
# Keep only words that occur at least three times
def more3_filter(list1):
    list2 = []
    for i in list1:
        if i[1] > 2:
            list2.append(i)
    return list2
'''
# Word-similarity merge (left disabled in the original): fold a word's
# count into the next entry when one string contains the other
def Similarity(theList):
    newList = copy.deepcopy(theList)
    for i in range(len(newList)):
        if newList[i][0] in newList[i+1][0]:
            name = newList[i][0]
            value = newList[i][1] + newList[i+1][1]
            newItem = (name, value)
            print(newItem)
            del newList[i]
            del newList[i+1]
            newList.append(newItem)
    return newList
'''
list_noneSense_words = List_none_sense()
# Tokenise and count
list_words = fenci(filename)
# Drop stop words, digits, and single characters
list_words2 = filter_none_sense_words(list_words)
# Keep words occurring at least three times
list_words3 = more3_filter(list_words2)
# Print the top ten words
list_words4 = list_words3[:10]
for i in list_words4:
    print(i)
# Write the filtered counts to Excel via a pandas DataFrame
df = pandas.DataFrame(list_words2)
df.to_excel("result.xlsx")
# generate_from_frequencies expects a dict, so convert the (word, count) pairs
list_words3 = dict(list_words3)
wc = WordCloud(collocations=False, font_path=font, width=1400, height=1400,
               margin=2).generate_from_frequencies(list_words3)
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file('show_Chinese.png')  # save the word cloud to disk
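The raw-count weighting above can also be swapped for jieba's built-in TF-IDF extractor, jieba.analyse.extract_tags. A minimal sketch, reusing the filename and font defined above; topK=50 and the output name show_tfidf.png are arbitrary choices, not from the original script.

# Alternative weighting via TF-IDF instead of raw counts
import jieba.analyse

with open(filename, encoding='utf-8') as f:
    text = f.read()

# withWeight=True returns (word, weight) pairs suitable for WordCloud
tags = jieba.analyse.extract_tags(text, topK=50, withWeight=True)
wc2 = WordCloud(collocations=False, font_path=font, width=1400,
                height=1400, margin=2).generate_from_frequencies(dict(tags))
wc2.to_file('show_tfidf.png')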