You only need to change the part of the code that locates the front-end elements (a bare-bones version is shown below).
from bs4 import BeautifulSoup
import urllib.request

def getHtml(url):
    resp = urllib.request.urlopen(url)
    data = resp.read()
    return data.decode("gbk")

def getOnePage(url):
    nextUrl = ""
    try:
        html = getHtml(url)
        soup = BeautifulSoup(html, "html.parser")
        lis = soup.find("div", attrs={"class": "con shoplist"}).find_all("li")
        for li in lis:
            title = li.find("a")["title"]
            author = li.find("p", attrs={"class": "search_book_author"}).find("a")["title"]
            time = li.find("p", attrs={"class": "search_book_author"}).find_all("span")[1].text
            publisher = li.find("p", attrs={"class": "search_book_author"}).find_all("span")[2].text
            # fields: author, publisher, pubdate, brief, price
            print("Title:", title)
            print("Author:", author)
            print("Publisher:", publisher)
            print("Publication date:", time)
            # print("Brief")
            # print("Price")
            print(" ")
    except Exception as err:
        print(err)
    return nextUrl

url = "http://search.dangdang.com/?key=python&act=input&page_index=1"
nextUrl = getOnePage(url)
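getOnePage returns nextUrl but never fills it in. A minimal paging sketch, assuming Dangdang keeps the page_index query parameter that appears in the URL above (the parameter name and the page count are taken from that URL, not verified against the site), is simply to loop over the index:

# Paging sketch (assumption: page_index in the query string selects the result page)
base = "http://search.dangdang.com/?key=python&act=input&page_index={}"
for page in range(1, 4):   # first 3 result pages, adjust as needed
    getOnePage(base.format(page))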
Taking the Douban Top 250 as an example:
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
import pandas as pd

'''
Scrape the required information from a single movie page.
'''

'''Check whether the page can be fetched.'''
def getHtml(url):
    proxy_ip = '127.0.0.1:8888'
    try:
        req = urllib.request.Request(url)
        # Add a request header to get past the site's anti-crawler checks
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0')
        # ProxyHandler routes requests through the given proxy
        proxy = urllib.request.ProxyHandler({'http': proxy_ip})
        # build_opener returns an OpenerDirector instance that chains the given handlers;
        # each handler is an instance of BaseHandler or one of its subclasses
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # Install the OpenerDirector instance as the default global opener
        urllib.request.install_opener(opener)
        # urlopen accepts either a URL string or a Request object
        data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(10)  # back off for 10 s on a URL error
    except Exception as e:
        print('exception:' + str(e))
        time.sleep(1)
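Before moving on, getHtml can be sanity-checked on its own. The sketch below assumes a local debugging proxy (for example Fiddler) really is listening on 127.0.0.1:8888; if not, drop the ProxyHandler lines above or point proxy_ip at a working proxy, otherwise every request will fail.

# Quick standalone check of getHtml (assumes the proxy above is reachable)
page = getHtml("https://movie.douban.com/subject/1292001/")
print(len(page) if page else "fetch failed")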
'''Get the content of each movie page (title + summary).'''
def getOnePage(url):
    movie_title, movie_content = "", ""
    try:
        # Fetch the page
        html = getHtml(url)
        # Parse it with a BeautifulSoup object
        soup = BeautifulSoup(html, "lxml")
        '''
        1. Scrape the summary
        HTML nesting: related-info -> indent -> all hidden
        Fallback handling: some summaries are long and folded behind a "show all" link,
        a. first try to grab the hidden (full) text,
        b. if that is missing, fall back to the visible summary.
        '''
        # The selector chain is long, so scrape it in two steps
        movie_content_pre = soup.find("div", attrs={"class": "related-info"}).find("div", attrs={"class": "indent"})
        # Fallback handling for the summary
        try:
            movie_content = movie_content_pre.find("span", attrs={"class": "all hidden"}).text
        except:
            movie_content = movie_content_pre.find("span", attrs={"property": "v:summary"}).text
        print("all", movie_content)
        movie_content = movie_content.strip()
        movie_content = movie_content.replace('\u3000', '').replace('\r', '')
        movie_content = movie_content.replace("\n", "").replace(" ", "")
        print(" 1.movie content done")
        '''
        2. Scrape the title (+ genres)
        Both live under the same <div id="content">;
        variable: information -> holds that content block
        '''
        '''
        information = soup.find("div", attrs={"id": "content"})
        # A movie carries several genre tags; movie_type_all holds all of them
        movie_type_all = information.find("div", attrs={"id": "info"}).find_all("span", attrs={"property": "v:genre"})
        # List that collects the genre strings
        movie_type = []
        # Walk over the tags and pull out each genre
        for i in range(len(movie_type_all)):
            types = movie_type_all[i].text
            movie_type.append(types)
        # Convert to str so it can be written to a file
        movie_type = str(movie_type)
        print(" 2.movie type done")
        '''
        # Look up the movie title
        movie_title = soup.find("div", attrs={"id": "content"}).find("span", attrs={"property": "v:itemreviewed"}).text
        print(" 2.movie title done")
        # print("Movie title\n", movie_title)
        # print("Movie genres\n", movie_type)
        # print("Movie summary\n", movie_content)
    # Catch and report any scraping error
    except Exception as err:
        print(err)
    # Return the data
    return movie_title, movie_content
# Check that a single movie page is scraped successfully
url = "https://movie.douban.com/subject/1292001/"
data = getOnePage(url)
with open("check.txt", "w", encoding="utf-8") as f:
    f.write(str(data))
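Because this file (one_page_content in the later import) is imported by the link-crawling module, the check above would also run on every import. A minimal sketch that guards it, using the standard __main__ idiom:

# Run the single-page check only when this file is executed directly,
# not when getHtml/getOnePage are imported by another module
if __name__ == "__main__":
    url = "https://movie.douban.com/subject/1292001/"
    data = getOnePage(url)
    with open("check.txt", "w", encoding="utf-8") as f:
        f.write(str(data))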
# From the ranking list, crawl page after page (jump crawling)
'''Build the links of all ranking pages.'''
from one_page_film import a_page_to_all_links

def get_All_Pages_Links():
    url_lst = []
    try:
        # Reference: https://zhuanlan.zhihu.com/p/62601606
        # Build the page URLs from a template
        url_style = "https://movie.douban.com/top250?start={index}&filter="
        # start runs from 0 to 225 in steps of 25 (10 pages)
        for i in range(0, 250, 25):
            url = url_style.format(index=i)
            url_lst.append(url)
    except Exception as err:
        print(err)
    return url_lst

# Links of the 10 ranking pages
url_list = get_All_Pages_Links()
# Nested crawling: from each ranking-page link to the movie links on that page
for i in url_list:
    # progress flag
    print("No: pages", i)
    a_page_to_all_links(i)
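Douban tends to throttle clients that request pages too quickly, so a short pause between ranking pages is a cheap safeguard. A minimal sketch of the loop above with a delay (the 3-second value is an arbitrary choice):

import time

for i in url_list:
    print("No: pages", i)
    a_page_to_all_links(i)
    time.sleep(3)   # pause between ranking pages to stay polite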
# On one ranking page, scrape the link of every movie
from bs4 import BeautifulSoup
# Reuse the functions from the single-page spider module
from one_page_content import getHtml, getOnePage

'''Scrape all the movie links on one ranking page.'''
def getPageLink(url):
    # movie_link_list collects the link of each movie
    movie_link_list = []
    try:
        # Fetch the page
        html = getHtml(url)
        # Parse it with a BeautifulSoup object
        soup = BeautifulSoup(html, "lxml")
        # Each movie sits in its own li tag
        all_links = soup.find("ol", attrs={"class": "grid_view"}).find_all("li")
        # Walk over the li tags and pull out each movie's link
        for i in range(len(all_links)):
            link = all_links[i].find("div", attrs={"class": "pic"}).find("a")['href']
            movie_link_list.append(link)
        # print("Links stored in the list:\n", movie_link_list)
    except Exception as err:
        print(err)
    return movie_link_list
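A quick standalone check of getPageLink (a hypothetical run, using the first ranking-page URL that also appears in the commented example below; it should yield 25 links):

# The first Top 250 page should give 25 movie links
links = getPageLink("https://movie.douban.com/top250?start=0&filter=")
print(len(links), "links found")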
def a_page_to_all_links(url):
    # All movie links on this ranking page go into one_page_link
    one_page_link = getPageLink(url)
    # progress counter
    epoch = 0
    for i in range(len(one_page_link)):
        epoch = epoch + 1
        print(" epoch:", epoch)
        # Scrape the content of a single movie
        one_movie_content = getOnePage(one_page_link[i])
        # print(one_movie_content)
        print("name", one_movie_content[0])
        print("content", one_movie_content[1])
        # Append to the corpus file
        with open("data/all_content_corpus.txt", 'a', encoding="utf-8") as f:
            f.write(one_movie_content[0])
            f.write(",")
            f.write(one_movie_content[1])
            f.write("\n")

# url = "https://movie.douban.com/top250?start=0&filter="
# a_page_to_all_links(url)
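Writing comma-separated lines by hand works, but a title or summary that itself contains a comma will break the format. A hedged alternative sketch using pandas (an extra import in this module; the helper name a_page_to_csv and the file name all_content_corpus.csv are arbitrary choices, not part of the original script) that quotes fields properly:

import pandas as pd

def a_page_to_csv(url, path="data/all_content_corpus.csv"):
    # Hypothetical helper: scrape one ranking page and append it as proper CSV rows
    rows = [getOnePage(link) for link in getPageLink(url)]   # each row is (title, summary)
    df = pd.DataFrame(rows, columns=["title", "summary"])
    df.to_csv(path, mode="a", index=False, header=False, encoding="utf-8-sig")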
This article showed how to crawl the Douban Movie Top 250 pages with Python, including extracting each movie's title, summary, and genres, and demonstrated how to work around anti-crawler measures and store the scraped data.