- Scrape a web novel
```python
from bs4 import BeautifulSoup
import urllib.request

url = "http://www.jueshitangmen.info/tian-meng-bing-can-11.html"
html = urllib.request.urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')

# Extract the text inside every <p> tag
all_p = soup.find_all('p')
for i in all_p:
    print('\n', i.get_text())
```
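The loop above only prints the paragraphs. If you want to keep the chapter, a minimal follow-up sketch writes the same text to disk (the `chapter.txt` file name is an arbitrary choice, not part of the original):

```python
from bs4 import BeautifulSoup
import urllib.request

url = "http://www.jueshitangmen.info/tian-meng-bing-can-11.html"
html = urllib.request.urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')

# Join every <p> block into one string and save it
# ('chapter.txt' is a hypothetical output path)
text = '\n'.join(p.get_text() for p in soup.find_all('p'))
with open('chapter.txt', 'w', encoding='utf-8') as f:
    f.write(text)
```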
- Scrape Baidu Images
```python
import urllib.request
import urllib.parse
import os
import re

# Both headers are required: without Referer the server returns a 403,
# and User-Agent makes the request look like it comes from a browser
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'referer': 'https://image.baidu.com'
}

url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&cg=girl&pn={pageNum}&rn=30&gsm=1e00000000001e&1490169411926="

keyword = input("Enter a keyword: ")
keyword = urllib.parse.quote(keyword, 'utf-8')  # URL-encode the keyword

n = 0  # page counter
j = 0  # image counter, used in the file names

while n < 3000:
    n += 1
    error = 0  # reset per page, otherwise one bad page would skip all later ones
    # Build and send the request for this page
    url1 = url.format(word=keyword, pageNum=str(n))
    rep = urllib.request.Request(url1, headers=header)
    rep = urllib.request.urlopen(rep)
    # Read the response body
    try:
        html = rep.read().decode('utf-8')
    except Exception:
        print("Failed to read the response")
        error = 1
    print("Current page:", n)
    if error == 1:
        continue
    # Pull the image URLs out of the JSON with a regex
    pattern = re.compile('thumbURL":"(.*?)"')
    data = re.findall(pattern, html)
    # Make sure the download directory exists
    if not os.path.isdir("D://pictures/图片"):
        os.makedirs(r"D://pictures/图片")
    # Download every image on this page
    for i in data:
        print(i)
        urllib.request.urlretrieve(i, "D://pictures/图片/pic{num}.jpg".format(num=j))
        j += 1

print("Total images downloaded: " + str(j))
```
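The thumbURL list returned by Baidu often contains dead links, and a single failed `urlretrieve` call would crash the loop above. A more defensive download helper might look like this (a sketch; `download_all` is a hypothetical name, reusing the `data` list and `j` counter from the code above):

```python
import urllib.request

def download_all(urls, out_dir, start_index=0):
    """Download each URL, skipping any that fail; return the next free index."""
    j = start_index
    for u in urls:
        try:
            urllib.request.urlretrieve(u, "{}/pic{}.jpg".format(out_dir, j))
            j += 1
        except OSError as e:  # urllib.error.URLError is a subclass of OSError
            print("skipped", u, "-", e)
    return j

# Replacing the inner download loop above:
# j = download_all(data, "D://pictures/图片", j)
```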
- Scrape Maoyan Top 100 movie information
```python
import urllib.request
import re
import json
from multiprocessing import Pool

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'referer': 'https://image.baidu.com'
}

# Fetch one page of the board
def get_one_page(url):
    rep = urllib.request.Request(url, headers=header)
    rep = urllib.request.urlopen(rep)
    s = rep.read().decode("utf-8")
    return s

# Pull the fields we need out of the HTML with a regex
def parse_one_page(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)'
        r'</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>'
        r'.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the "主演:" prefix
            'time': item[4].strip()[5:],    # drop the "上映时间:" prefix
            'score': item[5] + item[6]      # integer part + fraction part
        }

# Append one record to the output file as a JSON line
def writedata(data):
    with open("maoyan.txt", 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False) + '\n')

def main(num):
    url = 'http://maoyan.com/board/4?offset=' + str(num)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        writedata(item)

if __name__ == '__main__':
    # Create a process pool and fetch the ten pages in parallel
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
```
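Each line of `maoyan.txt` holds one JSON object, so reading the results back is straightforward. A small sketch (`load_movies` is a hypothetical helper, assuming the file was produced by `writedata` above):

```python
import json

def load_movies(path="maoyan.txt"):
    # One JSON-encoded movie record per line
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

movies = load_movies()
print(len(movies), "records loaded")
```

Note that the pool's worker processes all append to the same file; for ten short pages the risk of interleaved lines is small, but a `multiprocessing.Lock` or one file per worker would be safer.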