Python for Data Journalism
Instructor: Wu Ye, School of Journalism and Communication, Beijing Normal University
I. The Role of Python in Data Journalism
II. Python Web Scraping
1 The Simplest Scraper: Scraping a Single Page
import requests
from bs4 import BeautifulSoup

# Pretend to be a regular browser so the server does not reject the request
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    'Connection': 'keep-alive',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}

url = 'http://xcy.hubu.edu.cn/szqk/jsdw.htm'
# Download the page and decode the raw bytes as UTF-8
data = requests.get(url, headers=header).content.decode('utf8')
soup = BeautifulSoup(data, 'html.parser')

# The title text sits in the first div with class 'split'
contentdiv = soup.find_all('div', {'class': 'split'})
name = contentdiv[0].text

# Source and author are the first two <span> tags inside the 'infoArea' div
authordiv = soup.find_all('div', {'class': 'infoArea'})
author = authordiv[0].find_all('span')
newsfrom = author[0].text
newsauthor = author[1].text
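
In practice a fetch can time out, return an HTTP error code, or use an encoding other than UTF-8, and the one-liner above hides all of that. Below is a minimal defensive variant of the fetch step, using only standard requests features; the helper name fetch_soup and the 10-second timeout are illustrative choices, not part of the original example.

import requests
from bs4 import BeautifulSoup

def fetch_soup(url, headers, timeout=10):
    # Fail fast on network problems and on 4xx/5xx status codes
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    # Let requests guess the page encoding instead of hard-coding utf8
    r.encoding = r.apparent_encoding
    return BeautifulSoup(r.text, 'html.parser')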
2 Scraping a Single-Page Link List
# Collect the list of URLs from a single index page: Home -> Faculty Overview -> Teaching Staff
import requests
from bs4 import BeautifulSoup

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    'Connection': 'keep-alive',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}

url = 'http://xcy.hubu.edu.cn/szqk/jsdw.htm'
data = requests.get(url, headers=header).content.decode('utf8')
soup = BeautifulSoup(data, 'html.parser')

# Each entry is a div with class 'listguid' (our school's quick nav; funnily enough, the class is named after the right-side navigation, ha)
urldiv = soup.find_all('div', {'class': 'listguid'})

url_all = []
for urltemp in urldiv:
    # Take the href of the first <a> tag inside each entry
    url = urltemp.a['href']
    url_all.append(url)
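
The href values collected this way are often relative paths rather than full URLs, so before requesting them they usually need to be joined with the index page's URL. A small sketch using the standard library; the sample relative path in the comment is hypothetical.

from urllib.parse import urljoin

page_url = 'http://xcy.hubu.edu.cn/szqk/jsdw.htm'
# e.g. urljoin(page_url, '../info/1001.htm') -> 'http://xcy.hubu.edu.cn/info/1001.htm'
absolute_urls = [urljoin(page_url, href) for href in url_all]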
3 Scraping Multiple Pages
# Collect URLs across multiple pages
# Typical case: a 1 2 3 4 5 6 7 pager at the bottom of the page; the sample site is XuetangX
import requests
from bs4 import BeautifulSoup

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    'Connection': 'keep-alive',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}

# The page number is appended to the end of this base URL
url_path = 'http://www.xuetangx.com/courses?credential=0&page_type=0&cid=117&process=0&org=0&course_mode=0&page='

url_all = []
for i in range(12):
    print(i)  # progress indicator
    url = url_path + str(i + 1)  # pages are numbered from 1
    data = requests.get(url, headers=header).content.decode('utf8')
    soup = BeautifulSoup(data, 'html.parser')
    # Each course title div contains an <a> linking to the course page
    urldiv = soup.find_all('div', {'class': 'coursename'})
    for urltemp in urldiv:
        url_find = urltemp.a['href']
        url_all.append(url_find)
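
When a crawl loops over many pages, two habits are worth adding: pausing between requests so the server is not hammered, and saving the collected URLs so the scrape does not have to be repeated. A minimal sketch with the standard library, assuming the url_all list built by the code above; the 1-second delay and the output filename are arbitrary choices.

import csv
import time

# Inside the page loop above, a short pause keeps the crawl polite:
#     time.sleep(1)

def save_urls(urls, path='course_urls.csv'):
    # One URL per row so the list can be reopened in Excel or pandas
    with open(path, 'w', newline='', encoding='utf8') as f:
        writer = csv.writer(f)
        writer.writerow(['url'])
        for u in urls:
            writer.writerow([u])

save_urls(url_all)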