import json
import re

import requests
def getHTMLText(url):
    """Fetch ``url`` with browser-like headers and return the response body.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The decoded response text, or an empty string when the request
        fails (connection problem, timeout, invalid URL, or an HTTP
        error status reported by ``raise_for_status``).
    """
    # NOTE(security): the cookie below is a hard-coded login session copied
    # from a browser. It is kept verbatim so behavior is unchanged, but it
    # should be loaded from configuration/environment rather than committed.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "cookie": "t=925dee34e81f413e4fef7a69c7f090c0; thw=cn; enc=USKVUh4tJthiI1eCoRjSvXMMwcQCUB6Pm%2FG2e%2Bx8Zzj2R8mvHbyIRMvjw5uHSAxRJqybVuPHcpheH4rViVghGA%3D%3D; ubn=p; ucn=center; hng=CN%7Czh-CN%7CCNY%7C156; _tb_token_=7eee31ea6eeee; _m_h5_tk=4d5d14490fdce5319f8541adfdadb480_1588050782216; _m_h5_tk_enc=d614cdbac7a97458b8e2c70f4cbcd729; cookie2=1557558a4332ef81858dc21b47817056; _samesite_flag_=true; mt=ci=0_0; cna=zqE4Ft8uf1YCAW4QakxC3scY; v=0; sgcookie=EelepVTQ%2F07Mt9LFQ%2BFwN; unb=3978845940; uc1=cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&cookie15=URm48syIIVrSKA%3D%3D&cookie21=VT5L2FSpczFp&existShop=false&pas=0&cookie14=UoTUMtAXczzAIA%3D%3D; uc3=nk2=2%2FwbQB%2BJkrPffA%3D%3D&id2=UNkweIxWG1CnGg%3D%3D&vt3=F8dBxGRxWjf%2F%2BLsn9sU%3D&lg2=UIHiLt3xD8xYTw%3D%3D; csg=fc690fa1; lgc=%5Cu9AD8%5Cu6D69%5Cu8F691539; cookie17=UNkweIxWG1CnGg%3D%3D; dnk=%5Cu9AD8%5Cu6D69%5Cu8F691539; skt=344aaa20257e02a1; existShop=MTU4ODA0MjYyOA%3D%3D; uc4=id4=0%40Ug46tyo4ob3biEziEaxPUpGo6GU%2F&nk4=0%402ZJ%2Bt%2F9NMqURgG4XCISzJus7T6v1; tracknick=%5Cu9AD8%5Cu6D69%5Cu8F691539; _cc_=Vq8l%2BKCLiw%3D%3D; _l_g_=Ug%3D%3D; sg=90a; _nk_=%5Cu9AD8%5Cu6D69%5Cu8F691539; cookie1=UNk2H5M%2FwzJuY4rqcy9QEhX%2F39S6q%2B0QNDyKQ9qvKtI%3D; tfstk=c2dPBuwtzbhr1fGvB_CFPAV0q0ZRaaRH1YQdEdPsFzE3z0BlbsvYvZ0bv3KFXjXl.; isg=BLGxa6YHS-ekROcx-ophTXt2wD1LniUQ4sSRBZPGs3i3utAM2-yN4DHb3E7cL71I; l=eBIN-eicQJu_wU0GBOfwourza77O_IRjWuPzaNbMiT5POMfw9MiGWZjoJfYeCnGVHsOyR3yAaP04B7YO6ydSnxv9-M80ACMmndC..",
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        # Turn 4xx/5xx responses into an exception so they also yield "".
        r.raise_for_status()
        # Decode using the encoding detected from the body instead of the
        # one declared in the HTTP headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-level failures are treated as "no page"; unlike a bare
        # ``except`` this lets KeyboardInterrupt/SystemExit propagate.
        return ""
def parsePage(ilt, html):
    """Extract price/title pairs from page text and append them to ``ilt``.

    The page text is searched for ``"view_price":"..."`` and
    ``"raw_title":"..."`` fragments. Matches are paired by position and each
    pair is appended to ``ilt`` as ``[price, title]``, both strings.

    Args:
        ilt: List that receives the ``[price, title]`` entries (mutated in
            place).
        html: Page text to scan. Anything that is not a ``str`` is ignored.

    Returns:
        None. Results are delivered through ``ilt``.
    """
    if not isinstance(html, str):
        return

    # Capture groups pull out the values directly, so titles that contain a
    # colon are no longer truncated by a later ``split(":")``.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # Allow backslash escapes (including \") inside the title text.
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)

    # zip stops at the shorter list, so a count mismatch cannot raise.
    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode escapes such as \uXXXX the way a JSON string would be
            # decoded, without calling eval() on scraped content.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text exactly as found.
            title = raw_title
        ilt.append([price, title])
def printGoodsList(ilt):
    """Print the collected goods as a tab-separated table on stdout.

    A header row is printed first, then one numbered row (starting at 1) per
    entry, then a blank line.

    Args:
        ilt: Sequence of entries where index 0 is the price and index 1 is
            the title.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    header = row_format.format("序号", "价格", "商品名称")
    print(header)
    for number, goods in enumerate(ilt, 1):
        row = row_format.format(number, goods[0], goods[1])
        print(row)
    # Blank line closing the table.
    print("")
def main(goods="书包", depth=2):
    """Fetch several search-result pages for a keyword and print the goods found.

    Args:
        goods: Search keyword appended to the search URL as the ``q`` query
            value. Defaults to the keyword the script originally hard-coded.
        depth: Number of result pages to request. Defaults to 2.

    Returns:
        None. The collected entries are printed via ``printGoodsList``.
    """
    # Step between consecutive values of the ``s`` query parameter.
    # NOTE(review): presumably the number of results per page - confirm.
    page_step = 44
    start_url = "https://s.taobao.com/search?q=" + goods
    infoList = []
    for page in range(depth):
        try:
            url = start_url + "&s=" + str(page_step * page)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best effort: a page that fails is skipped so the remaining
            # pages are still processed. KeyboardInterrupt is not swallowed.
            continue
    printGoodsList(infoList)
# Run the scraper only when executed as a script, so importing this module
# does not trigger network requests.
if __name__ == "__main__":
    main()
# Scrape Taobao product information
# (Blog page residue: "Latest recommended article published on 2020-10-12 16:42:11")