A First Look at Web Crawlers

This post walks through the basic steps of web scraping with Python, using batch image downloads from jandan.net (煎蛋网) as the example. Because of restrictions on the site, the script misses some images, and text-encoding problems can come up when printing its output.


The script batch-downloads images from jandan.net. Note that it misses some images (a limitation imposed by the site), and printing the output may run into text-encoding problems.
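
The encoding problem usually shows up when Python 2's print statement writes a unicode string to a console whose default encoding (ASCII, GBK, ...) cannot represent it. A minimal workaround sketch, assuming Python 2 and a UTF-8-capable terminal (my own addition, not part of the script below), is to wrap sys.stdout before anything is printed:

import sys
import codecs

# Assumption: Python 2 with a terminal that can display UTF-8.
# After this, `print u"..."` no longer raises UnicodeEncodeError on
# consoles whose default encoding cannot encode the characters.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)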

#!/usr/bin/env python
# -*- coding:utf-8 -*-

# URL pattern: http://jandan.net/ooxx/page-<pageIndex>#comments (pageIndex from 1 to 3000)
import multiprocessing
import os
import random
import urllib
import urllib2
from multiprocessing import Pool, Queue, cpu_count

import BeautifulSoup
import re
import requests


# r=requests.get("//ww3.sinaimg.cn/bmiddle/7c8e8afbjw1dh9yimwp4xj.jpg")
# urllib.urlopen("//ww3.sinaimg.cn/bmiddle/7c8e8afbjw1dh9yimwp4xj.jpg")

# Default socket timeout so a stuck download does not hang a worker forever
import time
import socket

timeout = 10
socket.setdefaulttimeout(timeout)

# Earlier version: fetch the image URL and save it (kept commented out for reference)
'''
def saveImg(imageURL, fileName):
    try:
        u = urllib2.urlopen(imageURL, timeout=10)
        data = u.read()
        f = open(fileName, 'wb')
        f.write(data)
        f.close()
    except:
        print u"图片地址有问题"
'''

def saveImg(imageURL, pageIndex, fileName):
    try:
        # Pick a random User-Agent; the Referer header below is what gets the
        # request past the site's anti-hotlink check
        user_agent = [
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
            'Opera/9.25 (Windows NT 5.1; U; en)',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
            'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
            'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
            "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
            "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0",
        ]
        agent = random.choice(user_agent)
        opener = urllib2.build_opener()
        opener.addheaders = [("User-agent", agent), ("Accept", "*/*"),
                             ('Referer', 'http://jandan.net/ooxx/page-' + str(pageIndex))]

        u = opener.open(imageURL)
        data = u.read()

        f = open(fileName, 'wb')
        f.write(data)
        f.close()
    except urllib2.HTTPError, e:
        print u"图片地址有问题,httpcode:%s" % e.code
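
# Added sketch, not part of the original script: `requests` is imported above but never
# used. The same download can be written with it roughly as below. The function name and
# the 10-second timeout are illustrative choices; the Referer header mirrors the opener
# used in saveImg above. Nothing else in this script calls this function.
def saveImgWithRequests(imageURL, pageIndex, fileName):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0",
        "Referer": "http://jandan.net/ooxx/page-" + str(pageIndex),
    }
    resp = requests.get(imageURL, headers=headers, timeout=10)
    resp.raise_for_status()  # surface HTTP errors instead of silently saving an error page
    with open(fileName, "wb") as f:
        f.write(resp.content)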


# Collect the image URLs on one listing page
def getAllImgs(pageIndex):
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    url = "http://jandan.net/ooxx/page-" + str(pageIndex) + "#comments"
    # print url
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    r = response.read()
    # print r
    # Parse with BeautifulSoup; prettify() returns a normalised HTML string for the regex below
    soup = BeautifulSoup.BeautifulSoup(r).prettify()
    # print soup
    # Earlier attempt at the pattern:
    # s = soup.find(r"<span.*?righttext.*?<img src=(.*?)/>")
    pattern = re.compile('<div class="text">.*?<p.*?<img src="(.*?)"', re.S)
    images = re.findall(pattern, soup)
    return images  # a list of image URLs
    # for item in images:
    #     print item

# getAllImgs(2000)  # leftover manual test; commented out so it is not run every time
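
# Added sketch, not from the original post: the same image URLs can be collected with
# BeautifulSoup 3's own traversal instead of regexing the prettified HTML. It assumes the
# page keeps its <div class="text"> ... <img src="..."> structure; findAll and item access
# are standard BeautifulSoup 3 calls. Nothing below calls this function.
def getAllImgsWithSoup(pageIndex):
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    url = "http://jandan.net/ooxx/page-" + str(pageIndex) + "#comments"
    html = urllib2.urlopen(urllib2.Request(url, headers=headers)).read()
    soup = BeautifulSoup.BeautifulSoup(html)
    images = []
    for div in soup.findAll('div', {'class': 'text'}):
        for img in div.findAll('img'):
            src = img.get('src')
            if src:
                images.append(src)
    return images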



# q=getAllImgs(1)

# print q


# for pageIndex in range(1,1000):
# q = getAllImg(1)
# print q


# Save all the images from one page
def saveImgs(images, pageIndex=1):
    number = 1

    for imageURL in images:
        print imageURL
        splitPath = imageURL.split(".")
        fTail = splitPath.pop()
        fileName = str(pageIndex) + "/" + str(number) + "." + fTail
        if str(imageURL).startswith("h"):
            saveImg(imageURL, pageIndex, fileName)
        else:
            # protocol-relative URL ("//ww3.sinaimg.cn/..."): prepend the scheme
            imageURL1 = "http:" + imageURL
            saveImg(imageURL1, pageIndex, fileName)
        number += 1

# Create a directory named after the page number
def mkdir(pageIndex=1):
    path = str(pageIndex)  # .strip()
    # Check whether the path already exists:
    #   exists  -> True
    #   missing -> False
    isExists = os.path.exists(path)
    if not isExists:
        # Create the directory if it does not exist yet
        print u"偷偷新建了名字叫做", path, u'的文件夹'
        os.makedirs(path)
        return True
    else:
        # The directory already exists, so there is nothing to create
        print u"名为", path, u'的文件夹已经存在'
        return False


# Download all the images on one page
def savePageInfo(pageIndex):
    # Throttle a little so requests are not fired back to back
    time.sleep(2)
    # Collect the image URLs on this page
    images = getAllImgs(pageIndex)
    # Create the save directory, named after the page number
    mkdir(pageIndex)
    # Save every image
    saveImgs(images, pageIndex)


# Sequential version: download pages start..end one after another (pool below is the parallel version)
def savePagesInfo(start, end):
    for i in range(start, end + 1):
        savePageInfo(i)

# Download pages with a process pool
def pool(start, end):
    # startTime = time.time()
    p = Pool(cpu_count())
    pages = []

    numlist = []
    startTime = time.time()
    '''
    Earlier attempt: one multiprocessing.Process per page, started and joined one by one.
    for i in xrange(481, 490):
        p = multiprocessing.Process(target=savePageInfo, args=(i,))
        numlist.append(p)
        p.start()
        p.join()
        print "process end."
        time.sleep(5)
    endTime = time.time()
    print u"used time is ", endTime - startTime

    print u"太阳出来爬山破咯喂!!!!!!!!!!!!!!!"
    '''
    for i in xrange(start, end + 1):
        pages.append(i)
    # map_async returns immediately; the loop below polls until the pool drains
    result = p.map_async(savePageInfo, pages)

    print "Waiting for all subprocesses to finish..."
    flag = True
    while flag:
        time.sleep(5)
        # p._cache is an internal attribute of Pool: it is non-empty while tasks are pending
        process_count = len(p._cache)
        if process_count != 0:
            print "%s processes running" % process_count
            print "开始下载图片的process %s" % os.getpid()
        else:
            print u"all processes are finished!!"
            flag = False

    p.close()
    p.join()

    endTime = time.time()
    print "xiazaihaole"
    print u"used time is ", endTime - startTime


if __name__ == "__main__":
    pool(1, 50)
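
Before letting the pool loose on dozens of pages, it is worth fetching a single page synchronously to confirm that the regular expression in getAllImgs still matches the site's markup (jandan.net changes its templates now and then, which is one reason the script misses images). A minimal smoke test, assuming it is pasted at the bottom of the script in place of the pool(1, 50) call:

images = getAllImgs(1)
print u"page 1 yielded %d image URLs" % len(images)
for url in images[:3]:
    print url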