Example 1
Install the third-party libraries (requests and BeautifulSoup4) in PyCharm. Since we are on Python 3.5, BeautifulSoup4 is the better-supported choice (BeautifulSoup3 is no longer developed).
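Outside PyCharm, the same installation can be done from the command line with pip (assuming pip points at the Python 3.5 interpreter):

pip install requests beautifulsoup4

The scraper itself: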
import requests
from bs4 import BeautifulSoup

# fetch the faculty page; r.content holds the raw response bytes
r = requests.get('http://www.wise.xmu.edu.cn/people/faculty')
html = r.content
soup = BeautifulSoup(html, 'html.parser')

# the faculty links sit inside <div class="people_list">
div_people_list = soup.find('div', attrs={'class': 'people_list'})
a_s = div_people_list.find_all('a', attrs={'target': '_blank'})
for a in a_s:
    url = a['href']       # link target
    name = a.get_text()   # link text, i.e. the person's name
    print(name, url)      # print() is a function in Python 3
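If the page layout ever changes, find() returns None and the attribute access above raises AttributeError. A more defensive variant of the same lookup (purely illustrative, same selectors as above) checks the result first:

div_people_list = soup.find('div', attrs={'class': 'people_list'})
if div_people_list is None:
    print('people_list div not found; the page layout may have changed')
else:
    for a in div_people_list.find_all('a', attrs={'target': '_blank'}):
        print(a.get_text(), a['href'])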
https://zhuanlan.zhihu.com/p/21377121?refer=xmucpp
Example 2
Similar to the first example, but it adds inspecting the page in the browser: Ctrl+U opens the page source, and F12 opens the element inspector.
import requests
from bs4 import BeautifulSoup

def getHTML(url):
    # download the page and return the raw bytes
    r = requests.get(url)
    return r.content

def parseHTML(html):
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.body
    # drill down: <div class="middle"> then <div class="list-ct">
    company_middle = body.find('div', attrs={'class': 'middle'})
    company_list_ct = company_middle.find('div', attrs={'class': 'list-ct'})
    # each <ul class="company-list"> holds a batch of <li> company entries
    for company_ul in company_list_ct.find_all('ul', attrs={'class': 'company-list'}):
        for company_li in company_ul.find_all('li'):
            company_url = company_li.a['href']
            company_info = company_li.get_text()
            print(company_info, company_url)

URL = 'http://www.cninfo.com.cn/cninfo-new/information/companylist'
html = getHTML(URL)
parseHTML(html)
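BeautifulSoup also supports CSS selectors via select(), which can condense the chain of find()/find_all() calls; a sketch (the selector string is inferred from the class names used above):

def parseHTMLWithSelect(html):
    soup = BeautifulSoup(html, 'html.parser')
    # one CSS selector walks the same div.middle -> div.list-ct -> ul.company-list path
    for a in soup.select('div.middle div.list-ct ul.company-list li a'):
        print(a.get_text(), a['href'])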
https://zhuanlan.zhihu.com/p/21442500
Example 3
This example adds CSV output and uses the with...as context-manager protocol; pay attention to the encoding when writing the file.
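As a quick refresher, with...as works with any object that implements the context-manager protocol, i.e. __enter__ and __exit__; a minimal sketch with a hypothetical class (not part of the example below):

class ManagedResource:
    def __enter__(self):
        print('acquire')  # runs when the with block is entered
        return self       # this value is bound to the name after "as"
    def __exit__(self, exc_type, exc_value, traceback):
        print('release')  # runs on exit, even if an exception was raised
        return False      # False means any exception is re-raised

with ManagedResource() as res:
    print('inside the block')

The full example: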
import csv
import requests
from bs4 import BeautifulSoup

def getHTML(url):
    r = requests.get(url)
    return r.content

def parseHTML(html):
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.body
    company_middle = body.find('div', attrs={'class': 'middle'})
    company_list_ct = company_middle.find('div', attrs={'class': 'list-ct'})
    company_list = []
    for company_ul in company_list_ct.find_all('ul', attrs={'class': 'company-list'}):
        for company_li in company_ul.find_all('li'):
            company_url = company_li.a['href']
            company_info = company_li.get_text()
            # csv in Python 3 works with str, so no manual .encode() is needed
            company_list.append([company_info, company_url])
    return company_list

def writeCSV(file_name, data_list):
    # newline='' prevents blank lines on Windows; the encoding is set explicitly
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for data in data_list:
            writer.writerow(data)

URL = 'http://www.cninfo.com.cn/cninfo-new/information/companylist'
html = getHTML(URL)
data_list = parseHTML(html)
writeCSV('test.csv', data_list)
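To spot-check the output, the CSV can be read back with csv.reader (a quick verification snippet, assuming the script above has already produced test.csv):

import csv

with open('test.csv', newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row)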
https://zhuanlan.zhihu.com/p/21452812
Example 4
Scrape Douban's movie Top 250.
"""
爬取豆瓣电影TOP250 - 完整示例代码
"""
import codecs
import requests
from bs4 import BeautifulSoup
DOWNLOAD_URL = 'http://movie.douban.com/top250/'
def download_page(url):
return requests.get(url, headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
}).content
def parse_html(html):
soup = BeautifulSoup(html)
movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
movie_name_list = []
for movie_li in movie_list_soup.find_all('li'):
detail = movie_li.find('div', attrs={'class': 'hd'})
movie_name = detail.find('span', attrs={'class': 'title'}).getText()
movie_name_list.append(movie_name)
next_page = soup.find('span', attrs={'class': 'next'}).find('a')
if next_page:
return movie_name_list, DOWNLOAD_URL + next_page['href']
return movie_name_list, None
def main():
url = DOWNLOAD_URL
with codecs.open('movies', 'wb', encoding='utf-8') as fp:
while url:
html = download_page(url)
movies, url = parse_html(html)
fp.write(u'{movies}\n'.format(movies='\n'.join(movies)))
if __name__ == '__main__':
main()
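The loop above requests up to ten pages back to back; inserting a short pause between fetches is friendlier to the server (the one-second delay is an arbitrary choice, and the function below reuses download_page and parse_html from the example):

import time

def main_polite():
    url = DOWNLOAD_URL
    with codecs.open('movies', 'w', encoding='utf-8') as fp:
        while url:
            html = download_page(url)
            movies, url = parse_html(html)
            fp.write('{}\n'.format('\n'.join(movies)))
            time.sleep(1)  # one-second pause between page fetches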
https://xlzd.me/2015/12/16/python-crawler-03