import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os
class Class_Util_Spider():
    def __init__(self):
        pass
    # Get child links
    def get_child_links(self, base_url):
        """
        Purpose: collect all child links listed on the base_url page.
        Input:
            base_url: the page URL.
        Output:
            child_links: a list; each element is one child URL.
        """
        child_links = []
        # Fetch the page source
        html_text = requests.request("GET", base_url).text
        _links_ = re.findall('<a href="(.*?)/">', html_text)
        for _link_ in _links_:
            link = base_url + "/" + _link_
            child_links.append(link)
        return child_links
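    # A minimal usage sketch for this method; the mirror URL below is an
    # assumption for illustration, not a value taken from the original post:
    #   spider = Class_Util_Spider()
    #   folder_links = spider.get_child_links("https://pypi.tuna.tsinghua.edu.cn/simple")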
    # Get grandchild links
    def get_grandson_links_names(self, url_path_list, text_path):
        """
        Purpose: parse each page and extract the Python package names and download paths.
        Parameters:
            url_path_list: a list; each element is a page URL corresponding to one
                           package folder on the Tsinghua mirror.
            text_path: path of the text file used to store the package names and
                       download paths, separated by ",,,".
        Output:
            a text file in which every line holds two fields [name, link]:
                - name: the package name
                - link: the full download URL of the package
        """
        file = open(text_path, "w")
        for i, url_path in enumerate(url_path_list):
            print(f"{i}/{len(url_path_list)}", url_path)
            html_text = requests.request("GET", url=url_path).text  # fetch the page source
            package_links = re.findall('<a href="../../(.*?)"', html_text)  # find the links
            package_names = re.findall('>(.*?)</a>', html_text)  # find the file names
            # Write the links and file names into the text file
            for link, name in zip(package_links, package_names):
                link = "https://pypi.tuna.tsinghua.edu.cn/" + link  # build the full link
                file.write(str(name + ",,," + link + "\n"))  # write one line
        file.close()
    # Read the txt file into a list; each element is one "name,,,link" record
    def read_txt_2_list(self, txt_path):
        assert os.path.exists(txt_path)
        txt_file = open(txt_path, "r")
        data_list = []
        for data in txt_file.readlines():
            data = data.strip("\n")
            data_list.append(data)
        txt_file.close()
        return data_list
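    # Each returned element is a raw "name,,,link" string as written by
    # get_grandson_links_names; a caller can split a record back into its two
    # fields like this (the variable name `record` is illustrative):
    #   name, link = record.split(",,,")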
    # Download a file
    def download_file(self, save_dir, save_name, download_address, save_dir_his):
        assert os.path.exists(save_dir)
        save_path = os.path.join(save_dir, save_name)
        save_path_old = os.path.join(save_dir_his, save_name)
        if (not os.path.exists(save_path)) and (not os.path.exists(save_path_old)):
            save_content = requests.get(download_address).content
            with open(save_path, "wb") as f:
                f.write(save_content)
            print(f"Successfully downloaded {save_name}")
        else:
            print(f"{save_name} already exists!")
if __name__ == "__main__":
    import requests  # import the HTTP request module
    # Request headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/72.0.3626.121 Safari/537.36'}
    proxy = {'http': 'http://117.88.176.38:3000',
             'https': 'https://117.88.176.38:3000'}  # set the proxy IP and its port
    try:
        # Send the request to the target page; verify=False skips verification of the server's SSL certificate
        response = requests.get('http://2020.ip138.com',
                                headers=headers,
                                proxies=proxy,
                                verify=False,
                                timeout=3)
        print(response.status_code)  # print the response status code
    except Exception as e:
        print('Exception info:', e)  # print the exception message
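Because verify=False is used, urllib3 (bundled with requests) emits an InsecureRequestWarning on every such call; a small optional sketch for silencing that warning before making the request:

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # silence the warning triggered by verify=False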