The data saved earlier can be read back with pandas:
import pandas as pd

newsdf = pd.read_csv(r'F:\duym\gzccnews.csv')
1. Saving the scraped content to a sqlite3 database
import sqlite3

# write the DataFrame into the sqlite database as table 'gzccnews'
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    newsdf.to_sql('gzccnews', con=db)

# read the table back to verify the save
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    df2 = pd.read_sql_query('SELECT * FROM gzccnews', con=db)
Saving to a MySQL database
import pandas as pd
import pymysql
from sqlalchemy import create_engine

# connection string: user, passwd, host and port stand in for the real credentials
conInfo = "mysql+pymysql://user:passwd@host:port/gzccnews?charset=utf8"
engine = create_engine(conInfo, encoding='utf-8')

# append the scraped news to the 'news' table
df = pd.DataFrame(allnews)
df.to_sql(name='news', con=engine, if_exists='append', index=False)
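For symmetry with the sqlite3 example, the data can also be read back from MySQL through the same engine. A minimal sketch, reusing the placeholder connection string above (the table name 'news' comes from the code above; df_mysql is just an illustrative name):

import pandas as pd
from sqlalchemy import create_engine

# same placeholder connection string as above
engine = create_engine("mysql+pymysql://user:passwd@host:port/gzccnews?charset=utf8")

# read the whole 'news' table back into a DataFrame
df_mysql = pd.read_sql_query('SELECT * FROM news', con=engine)
print(df_mysql.head())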
As someone who loves sports, I chose to scrape NBA playoff news.
Here is the code that builds the scraper: it fetches a page and returns a BeautifulSoup object.
import requests
import chardet
from bs4 import BeautifulSoup

def creat_bs(url):
    result = requests.get(url)
    # detect the page's encoding and set it on the response object
    e = chardet.detect(result.content)['encoding']
    result.encoding = e
    c = result.content
    soup = BeautifulSoup(c, 'lxml')
    return soup
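Usage is straightforward; the URL below is only a placeholder for whichever listing page is being scraped:

# hypothetical example: build a soup object for one listing page
soup = creat_bs('https://example.com/nba-news')
print(soup.title)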
A helper that builds the set of page URLs to fetch:
def build_urls(prefix, suffix):
    urls = []
    for item in suffix:
        url = prefix + item
        urls.append(url)
    return urls
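For example, paginated listing pages can be generated by appending page suffixes to a common prefix (both values below are placeholders, not the site actually scraped):

# hypothetical example: pages 1..5 of a forum listing
page_urls = build_urls('https://example.com/nba-forum-', [str(i) for i in range(1, 6)])
print(page_urls)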
The scraping function:
import numpy as np

def find_title_link(soup):
    titles = []
    links = []
    try:
        contanier = soup.find('div', {'class': 'container_padd'})
        ajaxtable = contanier.find('form', {'id': 'ajaxtable'})
        page_list = ajaxtable.find_all('li')
        for page in page_list:
            titlelink = page.find('a', {'class': 'truetit'})
            # fall back to the <b> child when the link itself has no text
            if titlelink.text == '':
                title = titlelink.find('b').text
            else:
                title = titlelink.text
            # keep a random sample of roughly 10% of the posts
            if np.random.uniform(0, 1) > 0.90:
                link = titlelink.get('href')
                titles.append(title)
                links.append(link)
    except:
        print('have no value')
    return titles, links
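The title_group used below can then be collected by looping over the page URLs. A sketch, continuing the placeholder page_urls example above (the reply_group of post replies used later is gathered by an analogous function that is not shown in this section):

# assemble all sampled titles and links across the listing pages
title_group = []
link_group = []
for url in page_urls:
    soup = creat_bs(url)
    titles, links = find_title_link(soup)
    title_group.extend(titles)
    link_group.extend(links)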
Saving the data:
# concatenate all scraped titles and replies into one string
wordlist = str()
for title in title_group:
    wordlist += title
for reply in reply_group:
    wordlist += reply

def savetxt(wordlist):
    # write the text out as UTF-8 bytes
    f = open('wordlist.txt', 'wb')
    f.write(wordlist.encode('utf8'))
    f.close()

savetxt(wordlist)
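The word cloud below is built from wordlist.txt. The original generation code is not shown in this section; a minimal sketch using the jieba and wordcloud libraries (both assumed, along with the font path) could look like this:

import jieba
from wordcloud import WordCloud

# read back the saved text and segment the Chinese words with jieba
with open('wordlist.txt', 'r', encoding='utf8') as f:
    text = f.read()
words = ' '.join(jieba.cut(text))

# font_path must point to a font that contains Chinese glyphs, e.g. simhei.ttf
wc = WordCloud(font_path='simhei.ttf', background_color='white',
               width=800, height=600).generate(words)
wc.to_file('wordcloud.png')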
The generated word cloud: