Scraping Chongqing government public letters


Scraping script

# -*- coding: utf-8 -*-
"""
Created on Wed Aug 30 11:09:30 2017
1454页会报错
@author: toby
"""

import hashlib,bs4,requests,time,random
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd

#open the public-mail list page in Firefox
browser = webdriver.Firefox()
url="http://www.cq.gov.cn/publicmail/citizen/ReleaseMailListDistrict.aspx"
browser.get(url)

#grab the text of the mail table and split it into one row per line
elem=browser.find_element_by_id("dgrdMail")
str_text=elem.text
list_text=str_text.split("\n")

#save the first page to Excel
data=pd.DataFrame(list_text)
data.to_excel("7076.xlsx")

#page counter; each page is saved to its own Excel file named after the counter
x=10047
while x<21960:
    #click "next page"
    linkElem = browser.find_element_by_id("btnNext")
    linkElem.click()
    #grab the table text on the new page and save it
    elem=browser.find_element_by_id("dgrdMail")
    str_text=elem.text
    list_text=str_text.split("\n")
    data=pd.DataFrame(list_text)
    data.to_excel(str(x)+".xlsx")
    x+=1
    #random pause between pages; take a longer break every 100 pages
    time.sleep(random.randint(1,4))
    if x%100==0:
        time.sleep(300)
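
The loop writes one Excel file per page, while the visualization script below reads a single text file, 文本内容.txt. A minimal sketch of that missing merge step, assuming the per-page .xlsx files sit in the working directory and that the scraped rows live in the first data column:

# -*- coding: utf-8 -*-
#sketch (not in the original post): merge the per-page Excel exports into
#the single text file that the visualization script expects
import glob
import pandas as pd

rows = []
for path in sorted(glob.glob("*.xlsx")):
    #each file was written with DataFrame.to_excel, so column 0 is the index
    df = pd.read_excel(path, index_col=0)
    rows.extend(df.iloc[:, 0].astype(str).tolist())

with open("文本内容.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(rows))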

Visualization script

# -*- coding: utf-8 -*-
"""
Created on Thu Jul 27 15:30:27 2017

@author: toby
"""
import copy 
import jieba
import pandas
import jieba.analyse
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from matplotlib.font_manager import FontProperties
#path to a Chinese font so the word cloud can render Chinese characters
font = r'C:\Windows\Fonts\simfang.ttf'

#text file holding the scraped letter text (see the merge sketch above)
filename="文本内容.txt"
 
#load the stop-word ("nonsense word") list; each line may hold one or more words
def List_none_sense():
    file_noneSense_words=open("nonesense_words.txt",'r')
    list_noneSense_words=[]
    for line in file_noneSense_words:
        line_list=line.split()
        for word in line_list:
            list_noneSense_words.append(word)
    file_noneSense_words.close()
    return list_noneSense_words

#iterate over words, strip punctuation and odd symbols, then count how often each word appears
#check whether a word is in the stop-word list
def none_sense_words_Judeg(word):
    if word in list_noneSense_words:
        return True
    else:
        return False

#filter stop words out of the (word, count) list
def filter_none_sense_words(list1):
    list2=[]
    for i in list1:
        #keep the word only if it is not a stop word, not a number, and longer than one character
        if none_sense_words_Judeg(i[0])==False and i[0].isdigit()==False and len(i[0])>1:
            #print("remove",i)
            list2.append(i)
    return list2
    
    
#tokenize the file and count word frequencies
def fenci(filename) :
    f = open(filename,'rb')
    #read the whole file into one string
    file_list = f.read()
    #convert upper case to lower case
    file_list =file_list.lower()
    f.close()
    #segment the text with jieba
    seg_list = list(jieba.cut(file_list,cut_all=False))
    #build a dict mapping each word to its frequency
    tf={}
    for seg in seg_list :
        seg = ''.join(seg.split())
        if (seg != '' and seg != "\n" and seg != "\n\n") :
            if seg in tf :
                tf[seg] += 1
            else :
                tf[seg] = 1
    return sorted(tf.items(),key=lambda item:item[1],reverse=True)
    '''
    f = open("result.txt","w+")
    for item in tf:
        #print item
        f.write(item+"  "+str(tf[item])+"\n")
    f.close()
    '''
    
#keep words that appear at least three times
def more3_filter(list1):
    list2=[]
    for i in list1:
        if i[1]>2:
            list2.append(i)
    return list2
    
'''    
#word similarity
def Similarity(theList):
    newList=copy.deepcopy(theList)
    for i in range(len(newList)):
        if newList[i][0] in newList[i+1][0]:
            name=newList[i][0]
            value=newList[i][1]+newList[i+1][1]
            newItem=(name,value)
            print(newItem)
            del newList[i]
            del newList[i+1]
            newList.append(newItem)
    return newList
'''    
        
    
 
list_noneSense_words=List_none_sense()
#tokenize
list_words=fenci(filename)
#filter stop words
list_words2=filter_none_sense_words(list_words)
#keep words appearing at least three times
list_words3=more3_filter(list_words2)
#top 10
list_words4=list_words3[:10]
for i in list_words4:
    print (i)
    

#write the filtered word counts to Excel via a pandas DataFrame
df=pandas.DataFrame(list_words2)
df.to_excel("result.xlsx")    
    
#WordCloud's generate_from_frequencies expects a dict, so convert the (word, count) list
list_words3=dict(list_words3)

wc = WordCloud(collocations=False, font_path=font, width=1400, height=1400, margin=2).generate_from_frequencies(list_words3)

plt.imshow(wc)

plt.axis("off")
plt.show()
wc.to_file('show_Chinese.png')  #save the word cloud image
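
jieba.analyse is imported above but never used. As an alternative to the manual frequency count, jieba's built-in TF-IDF keyword extractor can feed the word cloud directly; a minimal sketch, assuming the same 文本内容.txt input, an arbitrary topK of 50, and an arbitrary output name show_tfidf.png:

# -*- coding: utf-8 -*-
#sketch (not in the original post): word cloud built from jieba's TF-IDF keywords
import jieba.analyse
from wordcloud import WordCloud

with open("文本内容.txt", "rb") as f:
    text = f.read().decode("utf-8", "ignore")

#extract_tags returns (keyword, weight) pairs when withWeight=True
keywords = jieba.analyse.extract_tags(text, topK=50, withWeight=True)
freq = dict(keywords)

wc = WordCloud(collocations=False, font_path=r'C:\Windows\Fonts\simfang.ttf',
               width=1400, height=1400, margin=2).generate_from_frequencies(freq)
wc.to_file('show_tfidf.png')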

 

Reposted from: https://www.cnblogs.com/webRobot/p/7463384.html
