Huawei App Market Crawler

This article walks through the implementation of a crawler for the Study & Education category of the Huawei app store. The crawler is written in Python and uses the requests and BeautifulSoup libraries to scrape app information (app name, developer, download count, and so on) from the category's list pages, then parses, downloads, and stores the results. The sections below cover each functional module: building the list of page URLs, parsing the HTML, downloading the APKs, and saving the data.
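
The search URLs simply embed the category name (raw or percent-encoded) and a page number in the path. As a minimal illustration, assuming the same /search/<category>/<page> pattern used in the code below, the encoded form of a category URL can be produced like this:

from urllib.parse import quote

category = '学习教育'  # Study & Education
print('http://appstore.huawei.com/search/{}/{}'.format(quote(category), 1))
# -> http://appstore.huawei.com/search/%E5%AD%A6%E4%B9%A0%E6%95%99%E8%82%B2/1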

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = 'blue'

from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from multiprocessing import Pool
import requests
import hashlib
import re
import os

total_page_nums = 6  # the Study & Education category spans 6 result pages

apk_num_inpage = 0    # apps found on the current list page
apk_num_total = 0     # apps seen across all pages
apk_num_download = 0  # apps actually downloaded
PRINT_MSG = ''
page_url_list = []
base_url = 'http://appstore.huawei.com'
root = r'D:\T\1'  # local directory for the downloaded APKs and the log file
# Pretend to be a desktop Chrome browser
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"}

# Huawei app store category search
def get_page_list(total_page_nums):
    for page_num in range(1, total_page_nums + 1):
        # page_url = 'http://appstore.huawei.com/search/应用工具/{}'.format(page_num)  # Tools category, 5 pages in total
        page_url = 'http://appstore.huawei.com/search/学习教育/{}'.format(page_num)  # Study & Education category, 6 pages in total
        # page_url = 'http://appstore.huawei.com/search/%E5%BA%94%E7%94%A8%E5%B7%A5%E5%85%B7%E7%B1%BB/{}'.format(page_num)  # App tools category, 4 pages in total
        # page_url = 'http://appstore.huawei.com/search/%E7%BD%91%E7%BB%9C%E6%B8%B8%E6%88%8F/{}'.format(page_num)  # Online games category, 5 pages in total
        page_url_list.append(page_url)
    return page_url_list
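
# For the Study & Education category, get_page_list(6) therefore returns the six
# list-page URLs http://appstore.huawei.com/search/学习教育/1 through .../6.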


def parse_html(page_url_list):
    global PRINT_MSG
    global apk_num_total
    global apk_num_download
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
    }
    page_num = 0
    for apk_list_page_url in page_url_list:
        page_num = int(apk_list_page_url.split('/')[-1])
        print(" ----------========== 爬取第%s页开始 ==========----------" % str(page_num))
        # with open('D:\\T\\1\\wandoujia_BeatifulSoup.txt', 'a+', encoding='utf_8') as ft1:
        #     ft1.write(" ----------========== 爬取第%s页开始 ==========----------" % str(page_num) + '\n')
        try:
            response = requests.get(apk_list_page_url, headers=headers)
        except RequestException as e:
            # Skip this list page if the request itself fails
            print(e)
            continue
        if response.status_code != 200:
            continue
        # 1.2 Set the response encoding
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, "lxml")
        # print(soup.prettify())
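        # The CSS selector below walks from <body> down to the <a> tag that wraps each
        # app icon on the list page; its href attribute points to the app's detail page.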
        links = soup.select("body > div.lay-body > div.lay-main > div.lay-left.corner > div.unit.nofloat > div.unit-main > div.list-game-app.dotline-btn.nofloat > div.game-info-ico > a")
        apk_num_inpage = 0
        for link in links:
            is_exsit = False
            file_list = []
            apk_num_inpage += 1
            response = requests.get(base_url + link.attrs["href"], headers=headers)
            if response.status_code == 200:
                response.encoding = 'utf-8'
                soup = BeautifulSoup(response.text, "lxml")
                # print(soup.prettify())
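                # The selectors below pull the app name, developer, download count,
                # download URL and version number out of the detail page's info panel.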
                appnames = soup.select('#bodyonline > div > div.lay-main > div.lay-left.hdn-x > div > div > div.app-info.flt > ul:nth-of-type(1) > li:nth-of-type(2) > p:nth-of-type(1) > span.title')[0].text
                # print(appnames)
                developers = soup.select("#bodyonline > div > div.lay-main > div.lay-left.hdn-x > div > div > div.app-info.flt > ul:nth-of-type(2) > li:nth-of-type(3) > span")
                # print(developers[0]['title'])
                apk_download_count = soup.select(
                    "#bodyonline > div > div.lay-main > div.lay-left.hdn-x > div > div > div.app-info.flt > ul:nth-of-type(1) > li:nth-of-type(2) > p:nth-of-type(1) > span.grey.sub")[0].text.split(':')[1]
                # print(apk_download_count)
                apk_download_url = soup.select('#bodyonline > div > div.lay-main > div.lay-left.hdn-x > div > div > div.app-function.nofloat > a')
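                # The actual APK link is embedded in the download button's onclick handler,
                # so it is extracted with a regular expression rather than read from an href.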
                apk_download_url = re.findall('(http://.*?source=portalsite)', apk_download_url[0]['onclick'], 0)[0]
                # print(apk_download_url)
                apk_version = soup.select('#bodyonline > div > div.lay-main > div.lay-left.hdn-x > div > div > div.app-info.flt > ul:nth-of-type(2) > li:nth-of-type(4) > span')[0].text
                # if(len(developers) != len(appnames)):
                #     print('应用名个数 != 开发商个数')
                #     print("appnamesNum:" + str(len(appnames)) + appnames)
                #     print("developersNum:" + str(len(developers)))
                # Skip apps that have already been saved to the download directory
                for file in os.listdir(root):
                    file_list.append(file)
                for apk_exsit in file_list:
                    if appnames in apk_exsit:
                        is_exsit = True
                if is_exsit:
                    print(appnames + "_V" + apk_version + " already exists, skipping")
                    continue
                download(appnames, apk_download_url, developers, apk_version, apk_download_count)
            else:
                print('Failed to fetch the app detail page!')
        print(" ----------========== 第%s页爬取完成,共%s个应用 ==========----------" % (str(page_num), str(apk_num_inpage)))
        # with open('D:\\T\\1\\wandoujia_BeatifulSoup.txt', 'a+', encoding='utf_8') as ft1:
        #     ft1.write(" ----------========== 第%s页爬取完成,共爬取%s个应用 ==========----------" % (str(page_num), str(apk_num_inpage)) + '\n')


def download(appnames, apk_download_url, developers, apk_version, apk_download_count):
    global apk_num_total
    global apk_num_download
    PRINT_MSG = ''
    apk_num_total += 1
    if not apk_download_url:
        return
    # Guard against detail pages that carry no developer information
    developer_title = developers[0]['title'] if developers else '--** no developer information **--'
    print(str(apk_num_total) + '-' + appnames + '\t' + developer_title + '\t' + apk_download_count)
    # Only download apps whose name or developer title contains '上海' (Shanghai)
    if appnames.find('上海') >= 0 or developer_title.find('上海') >= 0:
        response = requests.get(apk_download_url, headers=headers)
        if response.status_code == 200:
            # Fingerprint the APK payload so the log record carries a content hash
            apk_md5 = hashlib.md5(response.content).hexdigest()
            PRINT_MSG = developer_title + '\t' + appnames + '\tV' + apk_version + '\t' + apk_md5 + '\t' + apk_download_count + '\t' + apk_download_url
    if PRINT_MSG != '':
        print(PRINT_MSG)
        apk_num_download += 1
        # Append the record to the log file, then save the APK itself,
        # removing spaces and path separator characters from the app name
        with open(os.path.join(root, 'huawei_zuche.txt'), 'a+', encoding='utf-8') as ft1:
            ft1.write(PRINT_MSG + '\n')
        safe_name = appnames.strip().replace(' ', '').replace('|', '').replace('/', '').replace('\\', '')
        with open(os.path.join(root, '%s_V%s.apk' % (safe_name, apk_version)), 'wb') as ft:
            ft.write(response.content)


def main():
    # Create the download directory (including parent folders) if it does not exist yet
    if not os.path.exists(root):
        os.makedirs(root)
    url_page_list = get_page_list(total_page_nums)
    parse_html(url_page_list)
    # pool = Pool()
    # pool.map()


# 1.6 Program entry point
if __name__ == '__main__':
    main()
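
The commented-out Pool lines in main() hint at parallelizing the crawl. A minimal sketch of how that could look, assuming a hypothetical parse_single_page(page_url) helper refactored out of parse_html (note that the module-level counters would not be shared across worker processes):

from multiprocessing import Pool

def crawl_in_parallel():
    # One worker per list page; parse_single_page is assumed to handle a single URL
    page_urls = get_page_list(total_page_nums)
    with Pool(4) as pool:
        pool.map(parse_single_page, page_urls)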
