python3 爬取乐谱

最新推荐文章于 2025-10-30 13:40:16 发布

原创最新推荐文章于 2025-10-30 13:40:16 发布 · 3.1k 阅读

12 ·

CC 4.0 BY-SA版权

文章标签：

#python应用 #爬虫

小程序专栏收录该内容

4 篇文章

订阅专栏

部署运行你感兴趣的模型镜像

最近感觉找乐谱下载之类比较麻烦，于是花了两天时间写了个爬虫程序，效率倍增。

使用方法：在 https://www.17jita.com 上搜索乐谱，打开该乐谱的第一页，把网址中域名之后、“.html”之前的部分（例如 tab/img/4289）赋给程序中的 url_sa 变量，然后运行程序，便可以在 D:/pic/<乐谱标题>/ 目录下获得乐谱图片。

效果如下：

源码(日后有时间会继续完善):

import urllib.request
import re
import os
from bs4 import BeautifulSoup

# Site configuration and shared module-level state.
recipeWeb = "https://www.17jita.com/"  # base address every page URL is built from
recipeWeb_len = len(recipeWeb)

# Both lists start out empty.
webList, FinalList = [], []

# Site-relative path of the first page of the score, without the ".html" suffix.
url_sa = "tab/img/4289"
url = "".join((recipeWeb, url_sa, '.html'))


def getList(url):
    """Return the ``href=`` tokens of the page at *url* that mention ``html``.

    The page text is fetched with ``getData`` and filtered by
    ``findPreAndPost``: a token is kept when it starts with ``href=`` and
    the text ``html`` occurs somewhere in it.
    """
    return findPreAndPost(r'href=', r'.*?html', getData(url))
    
def findPreAndPost(pre, post, data):
    """Filter the whitespace-separated tokens of *data* by two regexes.

    *data* is split on every single whitespace character, so consecutive
    whitespace yields empty tokens. A token is kept, in its original
    order, when both *pre* and *post* match at the start of the token
    (``re.match`` semantics).

    Returns the list of kept tokens.
    """
    return [
        token
        for token in re.split(r'\s', data)
        if re.match(pre, token) and re.match(post, token)
    ]


def getData(url):
    """Fetch *url* and return the response body decoded as GBK text.

    The response is opened in a ``with`` block so the underlying
    connection is closed even when reading fails.

    Raises:
        urllib.error.URLError: if the URL cannot be opened.
        UnicodeDecodeError: if the body is not valid GBK.
    """
    with urllib.request.urlopen(url) as webPage:
        raw = webPage.read()
    return raw.decode('gbk')


def GetPic(url, cnt):
    """Download the PNG images referenced by one score page.

    The page at *url* is fetched with ``getData`` and the text of its
    ``<h1 class="ph">`` element is used as the title. Every
    whitespace-separated token of the raw HTML that starts with ``src=``
    and contains ``png`` is treated as an image reference. Only references
    whose address starts with ``h`` (absolute URLs) are downloaded; they
    are written below ``d:/pic/<title>/``.

    The first image saved for a page is named ``<title><cnt>.png``.
    Further images from the same page get a ``_2``, ``_3``, ... suffix so
    they no longer overwrite each other.

    Args:
        url: Address of the page to scan.
        cnt: Page counter used in the output file name.

    Raises:
        AttributeError: if the page has no ``<h1 class="ph">`` element.
    """
    data = getData(url)
    soup = BeautifulSoup(data, 'html.parser')
    title = soup.find('h1', {'class': 'ph'}).text

    candidates = [
        token
        for token in re.split(r'\s', data)
        if re.match(r'src=', token) and re.match(r'.*?png', token)
    ]

    path = "d:/pic/" + title + '/'
    saved = 0
    for pic in candidates:
        tmp = re.search(r'src="(.*?)"', pic)
        if tmp is None:
            # Not a complete double-quoted src attribute; nothing to fetch.
            continue
        pic_url = tmp.group(1)
        print(pic_url)
        # Skip empty and relative addresses; only absolute URLs are fetched.
        if not pic_url.startswith('h'):
            continue
        with urllib.request.urlopen(pic_url) as pic_web:
            pic_data = pic_web.read()
        os.makedirs(path, exist_ok=True)
        saved += 1
        # Keep the historical name for the first image, number the rest.
        suffix = "" if saved == 1 else "_" + str(saved)
        with open(path + title + str(cnt) + suffix + ".png", "wb") as f:
            f.write(pic_data)
        print(pic)


# Collect the pages of the score: the start page first, then every link on
# it whose href begins with url_sa, without duplicates.
webList = getList(url)
FinalList.append(url)
# url_sa is escaped so that it is matched literally, and the pattern is
# compiled once instead of being rebuilt for every link.
page_link = re.compile('href="' + re.escape(url_sa) + '(.*?)"')
for i in webList:
    tmp = page_link.search(i)
    if tmp:
        W = recipeWeb + url_sa + tmp.group(1)
        if W not in FinalList:
            FinalList.append(W)


# Download the images of each collected page; cnt is the 1-based position
# of the page in FinalList and ends up in the output file names.
for cnt, web_url in enumerate(FinalList, start=1):
    GetPic(web_url, cnt)

您可能感兴趣的与本文相关的镜像