Getting Started with Python Crawlers: Notes

Example 1

Install the third-party libraries (requests and BeautifulSoup4) in PyCharm. Since Python 3.5 is used here, BeautifulSoup4 is the better-supported choice (BeautifulSoup3 is no longer developed).
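The same installation can be done from the command line with pip install requests beautifulsoup4. A quick sanity check that both packages import correctly (a minimal sketch; both packages expose a __version__ attribute):

import requests
import bs4

# Confirm both libraries are importable and show their installed versions
print(requests.__version__)
print(bs4.__version__)

With the libraries in place, the first example fetches a faculty listing page and prints each person's name and link: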
import requests
from bs4 import BeautifulSoup
r = requests.get('http://www.wise.xmu.edu.cn/people/faculty')
html = r.content
soup = BeautifulSoup(html, 'html.parser')    # 'html.parser' is the parser to use
div_people_list = soup.find('div', attrs={'class': 'people_list'})
a_s = div_people_list.find_all('a', attrs={'target': '_blank'})
for a in a_s:
    url = a['href']
    name = a.get_text()
    print(name, url)

https://zhuanlan.zhihu.com/p/21377121?refer=xmucpp
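For comparison, BeautifulSoup also supports CSS selectors through select(); the same extraction can be written more compactly (a sketch equivalent to the find/find_all calls above):

import requests
from bs4 import BeautifulSoup

html = requests.get('http://www.wise.xmu.edu.cn/people/faculty').content
soup = BeautifulSoup(html, 'html.parser')
# Select <a target="_blank"> elements inside <div class="people_list">
for a in soup.select('div.people_list a[target="_blank"]'):
    print(a.get_text(), a['href'])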

Example 2

Similar to the first example; the main addition is inspecting page elements in the browser: Ctrl+U opens the page source, and F12 opens the developer tools' element inspector.
import requests
from bs4 import BeautifulSoup


def getHTML(url):
    r = requests.get(url)
    return r.content

def parseHTML(html):
    soup = BeautifulSoup(html,'html.parser')

    body = soup.body
    company_middle = body.find('div',attrs={'class':'middle'})
    company_list_ct = company_middle.find('div',attrs={'class':'list-ct'})

    for company_ul in company_list_ct.find_all('ul',attrs={'class':'company-list'}):
        for company_li in company_ul.find_all('li'):
            company_url = company_li.a['href']
            company_info = company_li.get_text()
            print(company_info, company_url)


URL = 'http://www.cninfo.com.cn/cninfo-new/information/companylist'
html = getHTML(URL)
parseHTML(html)

https://zhuanlan.zhihu.com/p/21442500
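Besides the browser's view-source and inspector, you can also examine the parsed tree from Python itself; a minimal sketch using BeautifulSoup's prettify() with the URL from this example:

import requests
from bs4 import BeautifulSoup

html = requests.get('http://www.cninfo.com.cn/cninfo-new/information/companylist').content
soup = BeautifulSoup(html, 'html.parser')
# Print the start of the re-indented parse tree to see the page structure
print(soup.prettify()[:2000])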

Example 3

The main addition is saving the results to CSV using the with...as context management protocol; pay attention to the encoding. (A minimal illustration of the protocol follows the example.)
# -*- coding: utf-8 -*-
import csv

import requests
from bs4 import BeautifulSoup


def getHTML(url):
    r = requests.get(url)
    return r.content


def parseHTML(html):
    soup = BeautifulSoup(html,'html.parser')

    body = soup.body
    company_middle = body.find('div',attrs={'class':'middle'})
    company_list_ct = company_middle.find('div',attrs={'class':'list-ct'})

    company_list = []    # changed: collect rows instead of printing them
    for company_ul in company_list_ct.find_all('ul',attrs={'class':'company-list'}):
        for company_li in company_ul.find_all('li'):
            company_url = company_li.a['href']
            company_info = company_li.get_text()
            company_list.append([company_info, company_url])    # changed: in Python 3 the csv module takes str, so no manual encode()

    return company_list    # changed: return the rows to the caller


def writeCSV(file_name, data_list):
    # Python 3: open in text mode with newline='' (as the csv docs advise)
    # and an explicit encoding, rather than binary mode plus manual encode()
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for data in data_list:
            writer.writerow(data)


URL = 'http://www.cninfo.com.cn/cninfo-new/information/companylist'
html = getHTML(URL)
data_list = parseHTML(html)    # changed: capture the returned rows
writeCSV('test.csv',data_list)

https://zhuanlan.zhihu.com/p/21452812
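The with...as statement works because file objects implement the context management protocol: __enter__ acquires the resource and __exit__ releases it, even if the body raises. A minimal sketch of the protocol with a toy class (purely illustrative, not part of the example):

class ManagedResource:
    def __enter__(self):
        print('acquire')    # runs when the with block is entered
        return self         # bound to the name after 'as'

    def __exit__(self, exc_type, exc_value, traceback):
        print('release')    # runs on exit, even if the body raised
        return False        # False means exceptions are not suppressed

with ManagedResource() as res:
    print('using', res)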

Example 4

Scraping the Douban movie Top 250.
#!/usr/bin/env python
# encoding=utf-8

"""
Scrape the Douban movie Top 250 - complete example code
"""

import codecs

import requests
from bs4 import BeautifulSoup

DOWNLOAD_URL = 'http://movie.douban.com/top250/'


def download_page(url):
    # Send a browser User-Agent so the site does not reject the request
    return requests.get(url, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }).content


def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')  # name the parser explicitly to avoid a bs4 warning
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})

    movie_name_list = []

    for movie_li in movie_list_soup.find_all('li'):
        detail = movie_li.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).getText()

        movie_name_list.append(movie_name)

    # The 'next' link's href is relative (e.g. '?start=25&filter='), so it is
    # appended to DOWNLOAD_URL; on the last page there is no <a>, and find() returns None
    next_page = soup.find('span', attrs={'class': 'next'}).find('a')
    if next_page:
        return movie_name_list, DOWNLOAD_URL + next_page['href']
    return movie_name_list, None


def main():
    url = DOWNLOAD_URL

    with codecs.open('movies', 'w', encoding='utf-8') as fp:
        while url:
            html = download_page(url)
            movies, url = parse_html(html)
            fp.write(u'{movies}\n'.format(movies='\n'.join(movies)))


if __name__ == '__main__':
    main()

https://xlzd.me/2015/12/16/python-crawler-03
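One note when actually running this: main() requests pages back-to-back as fast as parse_html returns a next URL. Adding a short pause between fetches is gentler on the site. A sketch reusing the functions defined above (the 1-second delay is an arbitrary choice, not part of the original):

import time

def main():
    url = DOWNLOAD_URL
    with codecs.open('movies', 'w', encoding='utf-8') as fp:
        while url:
            html = download_page(url)
            movies, url = parse_html(html)
            fp.write(u'{movies}\n'.format(movies='\n'.join(movies)))
            time.sleep(1)    # pause between page requests to avoid hammering the server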
