import requests
from lxml import etree
# from fake_useragent import UserAgent
class Spider:
    """Scrape goods pages from a shop site and append one CSV-like row per
    existing goods id to a local text file."""

    def __init__(self) -> None:
        # Output file; rows are appended as "id,title,price,views,sales,stock".
        self.__file = "./goods.txt"
        # Goods detail-page URL template; {} is the numeric goods id.
        self.__site = "http://47.94.174.223/index.php?s=/index/goods/index/id/{}.html"
        # Highest goods id to probe (inclusive).
        self.__maxID = 2000
        # self.__ua = UserAgent()

    def __getdata(self, url):
        """Fetch one goods page and return "title,price,views,sales,stock\\n".

        Returns None when the page is missing, returns a non-200 status,
        fails to parse, or the request itself errors out.
        """
        header_static = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
        }
        # header_random = {
        #     "User-Agent": self.__ua.random
        # }
        try:
            # timeout so a single dead connection cannot hang the whole crawl;
            # catching RequestException keeps one network hiccup from killing run().
            response = requests.get(url, headers=header_static, timeout=10)
        except requests.RequestException as exc:
            print(f"请求失败:{url} ({exc})")
            return None

        if response.status_code != 200:
            print(f"页面访问失败,状态码:{response.status_code}")
            return None
        if "资源不存在或已被删除" in response.text:
            print(f"当前url中不存在商品!{url}")
            return None

        html = etree.HTML(response.text)
        titles = html.xpath('//h1[@class="detail-title am-margin-bottom-xs"]/text()')
        prices = html.xpath('//b[@class="goods-price"]/text()')
        tm_count = html.xpath('//span[@class="tm-count"]/text()')
        stocks = html.xpath('//span[@class="stock"]/text()')
        # Guard against layout changes: the original code raised IndexError here
        # and aborted the entire crawl on one malformed page.
        if not titles or not prices or len(tm_count) < 3 or not stocks:
            print(f"页面解析失败:{url}")
            return None

        title = titles[0].strip()
        price = prices[0]
        # NOTE(review): indices 1 and 2 of tm-count were chosen by the original
        # author as view/sale counts — confirm against the live page layout.
        view_count = tm_count[1]
        sale_count = tm_count[2]
        stock = stocks[0]
        return f"{title},{price},{view_count},{sale_count},{stock}\n"

    def run(self):
        """Probe every goods id from 1 to maxID and append found rows to file."""
        # Context manager guarantees the handle is closed even if a fetch raises;
        # the original open()/close() pair leaked the handle on any exception.
        with open(self.__file, 'a+', encoding="utf-8") as file:
            for num in range(1, self.__maxID + 1):
                result = self.__getdata(self.__site.format(num))
                if result:
                    file.write(f"{num},{result}")
if __name__ == "__main__":
    # Entry point: build the spider and start the crawl.
    Spider().run()