A Python crawler that recursively fetches HTML pages and saves them following the site's directory structure

This article walks through a web-crawling example built on Python's requests and lxml libraries, showing how to set request headers, pass cookies along, parse the returned HTML, and follow its links recursively.


The crawler uses Python's requests and lxml libraries; both can be installed with pip install requests and pip install lxml.

The lxml installation guide is at https://lxml.de/installation.html; on Debian/Ubuntu, lxml can also be installed with apt-get install python-lxml (python3-lxml for Python 3).
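
As a quick sanity check that the two libraries work together, a minimal fetch-and-parse sketch looks like the following (the URL is only an example; any reachable HTML page will do):

# -*- coding: utf-8 -*-
# minimal requests + lxml check: fetch a page and list every hyperlink on it
import requests
from lxml import html

resp = requests.get('https://lxml.de/installation.html')
tree = html.fromstring(resp.content)
for a in tree.xpath('//a[@href]'):
    print(a.attrib['href'])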

The complete crawler code is as follows:

# -*- coding: utf-8 -*-
import os
import requests
from lxml import html

headers = {
    'Host': 'docs.qed-it.com',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    # note: even with gzip in Accept-Encoding, the response is not necessarily compressed;
    # that depends on whether the target site compresses its pages. requests/urllib3
    # transparently decompress gzip and deflate, so only those two are requested here.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    #'Authorization': 'Basic cWVkdGVzdGVyOmc1djhLUlZjdXBwNA==', 
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
}

cookies = dict(Cookie='__cfduid=da58e11c538b3d7981894a4763b5bb4db1539678780; _ga=GA1.2.1749017230.1539678802; _gid=GA1.2.986082824.1540429323')
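#note: the cookie values above were copied from the author's browser session and will likely be stale for anyone re-running the script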

#store the crawled urls
crawled_urls = []

#save the content of a page to path/filename, creating the directory if needed
def save(text, filename='temp', path='output'):
    setup_dir(path)
    fpath = os.path.join(path, filename)
    print('output:', fpath)
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(text)

#build the complete URL, e.g. http://www.xxx.com/xxx/yyy.html
#anything after the "html" suffix (such as an anchor) is stripped
def fix_url(prefix_url, url):
    ind = url.rfind('html')
    if ind != -1:
        url = url[0:ind + 4]
    return prefix_url + url

#for the recursive crawl, fix up the prefix URL to point at the directory of the page just fetched
def fix_prefix_url(url):
    ind = url.rfind('/')
    url = url[0:ind]
    return url + '/'

#get the save path of the file
def get_path(suffix_url):
    ind = suffix_url.rfind('/')
    if ind != -1:
        path = suffix_url[0:ind]  
    else:
        path = ''
    return path

#extract the file name (e.g. yyy.html) from the given relative URL
def fix_filename(suffix_url):
    ind0 = suffix_url.rfind('/') + 1   #0 when the URL contains no '/'
    ind1 = suffix_url.rfind('html')
    if ind1 != -1:
        ind1 += 4
    else:
        ind1 = len(suffix_url)
    return suffix_url[ind0:ind1]

#only URLs that contain "html" are crawled
def is_html(url):
    return url.rfind('html') != -1

#recursive crawl
#note: only links of the form <a class="reference internal" ...> are followed
def crawl(prefix_url, url, current_path):
    #only crawl html pages
    if not is_html(url):
        return
    #on every recursive call, recompute the save path, file name, target URL and prefix URL
    file_path = current_path + '/' + get_path(url)
    file_name = fix_filename(url)
    target_url = fix_url(prefix_url, url)
    prefix_url = fix_prefix_url(target_url)

    #skip the URL if it was already crawled in this run
    if target_url in crawled_urls:
        print(target_url + " has been crawled")
        return

    #also skip it if the file already exists on disk (e.g. from an earlier run)
    if os.path.exists(os.path.join(file_path, file_name)):
        print(target_url + " has been crawled")
        return

    print("will crawl " + target_url)

    crawled_urls.append(target_url)

    #visit the url (the headers defined above are sent along with cookies and basic auth)
    resp = requests.get(target_url, headers=headers, cookies=cookies, auth=('qedtester', 'g5v8KRVcupp4'), verify=False)
    #get the decoded text of the page from the response
    #page = resp.content   #raw bytes, if ever needed
    page = resp.text

    #note: save() writes the text back out as UTF-8 to file_path/file_name
    save(page, file_name, file_path)

    #parse the page with lxml and extract the internal hyperlinks
    html_tree = html.fromstring(page)
    target_urls = html_tree.xpath('//a[@class="reference internal"]')
    print("size of target_urls: " + str(len(target_urls)))
    for link in target_urls:
        #the href attribute is a URL relative to the current page
        suffix_url = link.attrib['href']

        #recursive crawl
        crawl(prefix_url, suffix_url, file_path)

def setup_dir(dir_name):
    if not os.path.exists(dir_name):
        print("mkdir", dir_name)
        os.makedirs(dir_name)
    else:
        print("%s already exists" % dir_name)

if __name__ == '__main__':
    print("Crawl is running...")
    prefix_url = 'https://docs.qed-it.com/docs/sdk-docs/en/latest/'
    #the first page to crawl
    url = 'index.html'
    current_path = os.getcwd()
    crawl(prefix_url, url, current_path)
    print("Crawl comes back.")

 

 

 
