[Personal archive] bs4 crawler code

This post shows how to scrape the Douban Movie Top 250 pages with Python, including each film's title, summary and genres, and how to deal with basic anti-crawler measures and store the scraped data.

For other sites, only the element-locating part needs to be changed (a bare-bones version follows).

from bs4 import BeautifulSoup
import urllib.request


def getHtml(url):
    resp = urllib.request.urlopen(url)
    data = resp.read()
    # Dangdang pages are GBK-encoded
    return data.decode("gbk")

def getOnePage(url):
    # Placeholder for the next-page URL; this minimal version never fills it in
    nextUrl = ""
    try:
        html = getHtml(url)
        soup = BeautifulSoup(html,"html.parser")
        lis = soup.find("div",attrs = {"class":"con shoplist"}).find_all("li")
  
        for li in lis:
            title = li.find("a")["title"]
            author = li.find("p",attrs={"class":"search_book_author"}).find("a")['title']
            time = li.find("p",attrs={"class":"search_book_author"}).find_all("span")[1].text
            publisher = li.find("p",attrs={"class":"search_book_author"}).find_all("span")[2].text
            
            # author / publisher / pubdate / brief / price
            print("Title:", title)
            print("Author:", author)
            print("Publisher:", publisher)
            print("Publication date:", time)
            # print("Summary:")
            # print("Price:")
            print("   ")
        
        
    except Exception as err:
        print(err)
    return nextUrl


url= "http://search.dangdang.com/?key=python&act=input&page_index=1"
nextUrl = getOnePage(url)
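
The minimal version above never fills in nextUrl; a small sketch of paging instead by bumping the page_index query parameter (assuming Dangdang keeps this URL scheme):

for page in range(1, 4):   # crawl the first three result pages
    page_url = "http://search.dangdang.com/?key=python&act=input&page_index={}".format(page)
    getOnePage(page_url)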





   


Taking the Douban Top 250 as an example:

from bs4 import BeautifulSoup
import urllib.request
import time
import pandas as pd
'''
Scrape the required information for a single movie.
'''

'''Check whether the page can be fetched, and return its HTML.'''
def getHtml(url):
    # Local debugging proxy (e.g. Fiddler); it is only mapped for http,
    # so https requests such as Douban's go out directly
    proxy_ip = '127.0.0.1:8888'
    try:
        # Request is the abstraction of a URL request
        req = urllib.request.Request(url)
        # Add a browser User-Agent header to get past the site's basic anti-crawler check
        req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0')

        # build_opener returns an OpenerDirector that chains the given handlers
        # (ProxyHandler, HTTPHandler) in order
        proxy = urllib.request.ProxyHandler({'http': proxy_ip})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # Install the OpenerDirector instance as the default global opener
        urllib.request.install_opener(opener)
        # Open the URL (a string or a Request object) and decode the response
        data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')

        return data
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(10)  # back off for 10 s on an HTTP/URL error
    except Exception as e:
        print('exception:' + str(e))
        time.sleep(1)
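
# A requests-based alternative to getHtml, shown only as a sketch (the helper
# name getHtmlRequests is mine, not part of the original script); the proxy
# above points at 127.0.0.1:8888, a local debugging proxy such as Fiddler,
# and can be dropped entirely for a plain crawl.
import requests

def getHtmlRequests(url):
    # Same browser User-Agent trick to get past the basic anti-crawler check
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) '
                             'Gecko/20100101 Firefox/57.0'}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = 'utf-8'
    return resp.text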



'''Fetch the content of a single movie page (title + summary).'''
def getOnePage(url):
    # Defaults so the function still returns something if the scrape fails
    movie_title, movie_content = "", ""
    try:
        # Fetch the page HTML
        html = getHtml(url)

        # Parse it with a BeautifulSoup object
        soup = BeautifulSoup(html, "lxml")

        '''
        1. Scrape the summary.
            HTML nesting: related-info -> indent -> "all hidden"
            Fallback handling: long summaries are collapsed behind a "show more" link, so
                a. first try the hidden full-text span,
                b. if it is missing, fall back to the visible summary span.
        '''
        # The selector chain is long, so fetch it in two steps
        movie_content_pre = soup.find("div", attrs={"class":"related-info"}).find("div", attrs={"class":"indent"})

        # Fallback handling for the summary
        try:
            movie_content = movie_content_pre.find("span", attrs={"class":"all hidden"}).text
        except:
            movie_content = movie_content_pre.find("span", attrs={"property":"v:summary"}).text
        print("all", movie_content)

        # Clean up whitespace in the summary
        movie_content = movie_content.strip()
        movie_content = movie_content.replace('\u3000', '').replace('\r', '')
        movie_content = movie_content.replace("\n", "").replace("  ", "")
        print("     1.movie content done")
      
        '''
        2. Scrape the title (+ genres).
            Both sit under the same <div id="content">;
            variable: information -> holds the contents of that div.
            The genre extraction is kept below but commented out.
        '''
        '''
        information = soup.find("div", attrs={"id":"content"})
        # A movie carries several genre tags; movie_type_all collects all of them
        movie_type_all = information.find("div", attrs={"id":"info"}).find_all("span", attrs={"property":"v:genre"})
        # List to hold the genre strings
        movie_type = []
        # Iterate over the tags to pull out each genre
        for i in range(len(movie_type_all)):
            types = movie_type_all[i].text
            movie_type.append(types)

        # Convert to str so it can be written to a file
        movie_type = str(movie_type)

        print("     2.movie type done")
        '''

        # Find the movie title
        movie_title = soup.find("div", attrs={"id":"content"}).find("span", attrs={"property":"v:itemreviewed"}).text
        print("     2.movie title done")

        # print("Movie title\n", movie_title)
        # print("Movie genres\n", movie_type)
        # print("Movie summary\n", movie_content)
    

    # Error handling
    except Exception as err:
        print(err)


    # Return the data
    return movie_title, movie_content


# Check that the content of a single movie page can be scraped successfully
url = "https://movie.douban.com/subject/1292001/"
data = getOnePage(url)
with open("check.txt", "w", encoding="utf-8") as f:
    f.write(str(data))
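
# pandas is imported above but never used; a minimal sketch of storing the result
# with pandas instead of the plain check.txt dump (the column names and the CSV
# path are my own choice, not part of the original script)
rows = [data]   # data is a (title, summary) tuple
pd.DataFrame(rows, columns=["title", "summary"]).to_csv("check.csv", index=False, encoding="utf-8")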

#   Walk the ranking pages one by one (page-by-page crawl)
'''Crawl all the movie links of each ranking page.'''
from one_page_film import a_page_to_all_links

def get_All_Pages_Links():
    url_lst = []
    try:
        # Reference: https://zhuanlan.zhihu.com/p/62601606
        # Build the ranking-page URLs from a template
        url_style = "https://movie.douban.com/top250?start={index}&filter="
        # start = 0, 25, ..., 225 -- ten ranking pages in total
        for i in range(0, 250, 25):
            url = url_style.format(index=i)
            url_lst.append(url)

    except Exception as err:
        print(err)
    return url_lst
    
# Store the links of the 10 ranking pages
url_list = get_All_Pages_Links()
# Nested crawl: from each ranking-page link, fetch that page's movie links
for i in url_list:
    # progress flag
    print("page:", i)
    a_page_to_all_links(i)
    
# Crawl every movie link on one page of the ranking
from bs4 import BeautifulSoup
# Functions from the spider file above
from one_page_content import getHtml, getOnePage

'''Collect all movie links on the given ranking page.'''
def getPageLink(url):
    # movie_link_list stores the link of each movie
    movie_link_list = []
    try:
        # Fetch the page HTML
        html = getHtml(url)
        # Parse it with a BeautifulSoup object
        soup = BeautifulSoup(html, "lxml")

        # Each movie sits in its own li tag
        all_links = soup.find("ol", attrs={"class":"grid_view"}).find_all("li")

        # Loop over the li tags to get each movie's link
        for i in range(len(all_links)):
            link = all_links[i].find("div", attrs={"class":"pic"}).find("a")['href']
            movie_link_list.append(link)

        # print("Links stored in the list:\n", movie_link_list)

    except Exception as err:
        print(err)

    return movie_link_list



def a_page_to_all_links(url):
    # All movie links on this page are stored in one_page_link
    one_page_link = getPageLink(url)
    # progress flag
    epoch = 0
    for i in range(len(one_page_link)):
        epoch = epoch + 1
        print(" epoch:", epoch)

        # Scrape the single movie's title and summary
        one_movie_content = getOnePage(one_page_link[i])

        # print(one_movie_content)
        print("name", one_movie_content[0])
        print("content", one_movie_content[1])

        # Append to the corpus file
        with open("data/all_content_corpus.txt", 'a', encoding="utf-8") as f:
            f.write(one_movie_content[0])
            f.write(",")
            f.write(one_movie_content[1])
            f.write("\n")


# url = "https://movie.douban.com/top250?start=0&filter="
# a_page_to_all_links(url)
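
# a_page_to_all_links appends to data/all_content_corpus.txt, so the data/
# directory has to exist before the first write; a small sketch using the
# standard library (not part of the original script):
import os
os.makedirs("data", exist_ok=True)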

    
    
