Python: scraping the URLs and the CSS/JS file addresses that a single web page loads

Straight to the code:

This is the sanitized Python scraping script I use myself.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author:Andy
@file:xxx.py
@time: 05:50 PM
@desc: scrape article data into the blog
"""
import os
import re
import time
import requests
from bs4 import BeautifulSoup, SoupStrainer
from requests.exceptions import RequestException
from hashlib import md5
from urllib.parse import urlparse
# urllib.request / urllib.error are used below, so import the submodules explicitly
import urllib.request
import urllib.error

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}


def get_content():
    url = 'http://ask.xxxx.com/question/xxxx'  # url
    response = requests.get(url, headers=headers).text.replace('<i class="fa fa-paperclip"></i>', '')
    soup = BeautifulSoup(response, 'lxml')
    # div = soup.select('#aw-mod-body ueditor-p-reset')
    pattern = re.compile(r'<a\shref="(http://ask.apelearn.com/file.*?)".*?>(.*?)</a>', re.S)
    p = soup.find_all('a')
    for item in p:
        # print(str(item))
        result = re.findall(pattern, str(item))
        if result:
            # print(result)
            for i in result:
                url, name = i
                # print(i)
                yield {
                    'url': url,
                    'name': name
                }
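
# Usage sketch (my assumption; main() below only has the call commented out):
#   for item in get_content():
#       print(item['url'], item['name'])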


def mkdir(path):
    # strip leading/trailing whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    # check whether the path already exists
    isExists = os.path.exists(path)
    if not isExists:
        # create the directory (including any missing parent directories)
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        # the directory already exists, so there is nothing to create
        print(path + ' already exists')
        return False
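
# Note: on Python 3, os.makedirs(path, exist_ok=True) achieves the same effect
# as mkdir() above without the explicit os.path.exists() check.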

def getUrl(html):
    # extract the <img src="..."> and <script src="..."> URLs from the HTML
    #patterncss = '<link href="(.*?)"'
    patternjs = '<script src="(.*?)"'
    patternimg = '<img src="(.*?)"'
    #href = re.compile(patterncss, re.S).findall(html)
    href = re.compile(patternimg, re.S).findall(html)
    href += re.compile(patternjs, re.S).findall(html)
    return href

def getCssUrl(html):
    # extract the <link href="..."> (stylesheet) URLs from the HTML
    patterncss = '<link href="(.*?)"'
    href = re.compile(patterncss, re.S).findall(html)
    return href
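
# A more robust alternative to the regexes above (a sketch, not used by this
# script) is to let BeautifulSoup collect the asset URLs:
#   soup = BeautifulSoup(html, 'lxml')
#   css = [link['href'] for link in soup.find_all('link', href=True)]
#   js = [script['src'] for script in soup.find_all('script', src=True)]
#   imgs = [img['src'] for img in soup.find_all('img', src=True)]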

# Download a page together with the assets it references
def download_html(root_path, url):
    a = urlparse(url)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    if file_suffix != '.html':
        file_name_real = file_name + '.html'
    else:
        file_name_real = file_name
    file_path_real = file_path.replace(file_name, '')
    file_path_reals = file_path_real.replace('/', "\\")
    all_file_path_real = root_path + file_path_reals + file_name_real
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # use a name other than "re" here so the regex module is not shadowed
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"

    # download the images and scripts referenced by the page
    itemurl = getUrl(resp.text)
    for item1 in itemurl:
        download_commonimgjs(root_path, item1)

    # download the stylesheets referenced by the page
    itemcssurl = getCssUrl(resp.text)
    for item2 in itemcssurl:
        download_css(root_path, item2)

    # rewrite the absolute links before saving the page locally
    new_text = resp.text.replace('https://www.xxxxxx.com', 'http://www.xxxxx.com')
    new_texts = new_text.replace('xxxxxx.com', '3cinno.shanhubei.com')
    with open(all_file_path_real, "w+", encoding="utf-8") as html_file:
        html_file.write(new_texts)
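
# Flow note: download_html() first downloads the referenced images and scripts,
# then the stylesheets, and finally rewrites the absolute links before saving
# the page, e.g. (domain masked, as in the rest of the script):
#   download_html("F:\\Project-cz\\shanhubei\\3cinno", "https://www.xxxxxx.com/some-page")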

def download_commonimgjs(root_path, url):
    if str(url[:1]) == r"/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    # print(os.path.curdir(file_path))
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    if os.path.isfile(newmkpath + file_name):
        return
    # create the target directory if it does not exist yet
    mkdir(newmkpath)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgurl, newmkpath + file_name)
    except urllib.error.HTTPError:
        print('download failed: ' + imgurl)



def download_img(root_path, url):
    if str(url[:1]) == r"/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    # print(os.path.curdir(file_path))
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    # create the target directory if it does not exist yet
    mkdir(newmkpath)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(imgurl, newmkpath + file_name)

def download_js(root_path, url):
    if str(url[:1]) == r"/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    # print(os.path.curdir(file_path))
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    # create the target directory if it does not exist yet
    mkdir(newmkpath)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(imgurl, newmkpath + file_name)

def download_css(root_path, url):
    if str(url[:1]) == r"/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    if file_suffix != '.css':
        return
    # print(os.path.curdir(file_path))
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    if os.path.isfile(newmkpath + file_name):
        return
    # create the target directory if it does not exist yet
    mkdir(newmkpath)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgurl, newmkpath + file_name)
    except urllib.error.HTTPError:
        print('download failed: ' + imgurl)
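
# Note: download_img(), download_js() and download_css() differ only in the
# file-suffix check; a single helper such as download_asset(root_path, url,
# suffix=None) (a possible refactor, not part of the original script) could
# replace all three.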

def get_xml():
    url = 'https://www.xxxxxx.com/sitemap-1.xml'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    # Write the regex to match your own article URL format; it may differ from mine.
    # Note: a non-greedy \S*? at the end of the pattern matches nothing, so use
    # [^<\s]+ to stop at the closing </loc> tag instead.
    r = re.compile(r'https://www\.xxxxxx\.com/[^<\s]+')
    big = re.findall(r, res.text)
    for i in big:
        print(i)


def main():
    # get_content()
    # url = r'https://www.xxxxxx.com/news/xxxx-proje-20711498'
    url = r'https://www.xxxxxx.com/uploads/20218080/logo202107221507387902092.png'
    # root directory that the mirrored files are written into
    root_path = "F:\\Project-cz\\shanhubei\\3cinno"
    #download_img(root_path, url)

    #htmlurl = r'https://www.xxxxxx.com/3d-clear-led-dmx-ball'
    #download_html(root_path, htmlurl)

    cssurl = r'https://www.xxxxxx.com/images/m184/black/style.css'
    #download_css(root_path, cssurl)

    #demourl = 'https://www.xxxxxx.com/Content/kcim/js/layim-public.js?t=20190404'
    #demo(demourl)

    get_xml()


def demo(url):
    a = urlparse(url)
    file_path = a.path
    print(a.scheme)
    print(a.hostname)
    print('a.file_path=' + file_path)
    file_name = os.path.basename(file_path)
    print('file_name=' +file_name)
    _, file_suffix = os.path.splitext(file_name)
    print('a.file_suffix=' + file_suffix)





if __name__ == '__main__':
    main()
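
For reference, this is a minimal way to drive the whole script from the sitemap. It is only a sketch built on the functions above, and it assumes the masked xxxxxx.com domains have been replaced with the real site:

import re
import requests

root_path = "F:\\Project-cz\\shanhubei\\3cinno"
headers = {'User-Agent': 'Mozilla/5.0'}
sitemap = requests.get('https://www.xxxxxx.com/sitemap-1.xml', headers=headers).text
# sitemap entries sit inside <loc>...</loc> tags
for page_url in re.findall(r'<loc>(.*?)</loc>', sitemap):
    download_html(root_path, page_url)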

Source: http://www.shanhubei.com/archives/2491.html
