注意:依据每个关键词的不同,正则表达式可能需要修改。本例中的正则表达式,仅对应于本例中的关键词。
import requests,re
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    The request uses a 30-second timeout, an HTTP error status is treated
    as a failure, and the body is decoded with the encoding that requests
    detects from the content (``apparent_encoding``).

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string if the request fails.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-level failures (connection, timeout, bad status,
        # invalid URL) are turned into an empty result. Anything else,
        # including KeyboardInterrupt, is allowed to propagate.
        return ''
def parserHTML(infoList, html):
    """Extract price/title pairs from ``html`` and append them to ``infoList``.

    The page text is scanned for ``"price":"<digits and dots>"`` and
    ``"title":"<text>"`` fragments. The n-th price is paired with the n-th
    title, and each pair is appended as ``[price, title]`` (both strings).
    If the two lists differ in length, the unmatched tail is ignored.

    Args:
        infoList: List that receives the ``[price, title]`` pairs (mutated
            in place).
        html: Page text to scan; an empty string adds nothing.
    """
    # Local import: the file-level import line only provides requests and re.
    import ast

    # Capture groups return the value directly, so a ':' inside a title no
    # longer truncates it the way splitting the whole match on ':' did.
    prices = re.findall(r'"price":"([\d.]*)"', html)
    title_literals = re.findall(r'"title":(".*?")', html)

    for price, literal in zip(prices, title_literals):
        try:
            # SECURITY: the title comes from a remote page, so it must not be
            # passed to eval(). literal_eval decodes the quoted string
            # (including backslash escapes) without executing code.
            title = ast.literal_eval(literal)
        except (ValueError, SyntaxError):
            # Malformed literal (e.g. a dangling backslash): skip this item
            # instead of abandoning the rest of the page.
            continue
        infoList.append([price, title])
def printInfo(infoList):
    """Print the collected items as a numbered, tab-separated table.

    A header row is printed first, followed by one row per item, numbered
    from 1.

    Args:
        infoList: Sequence of ``[price, title]`` pairs; only the first two
            elements of each entry are used.
    """
    tplt = '{:4}\t{:6}\t{:18}'
    print(tplt.format('序号', '价格', '名称'))
    # enumerate supplies the 1-based row number instead of a manual counter.
    for count, il in enumerate(infoList, start=1):
        print(tplt.format(count, il[0], il[1]))
def main(keyword='笔记本电脑', deep=5):
    """Crawl Taobao search result pages for ``keyword`` and print a table.

    For each of ``deep`` pages the search URL is built, the page is
    downloaded with getHTMLText, and its price/title pairs are collected
    with parserHTML. The accumulated list is then printed with printInfo.

    Args:
        keyword: Search term appended to the query string.
        deep: Number of result pages to request.
    """
    url = 'https://s.taobao.com/search?q=' + keyword
    infoList = []
    for i in range(deep):
        try:
            # The "s" parameter advances by 44 per page -- presumably the
            # number of results per page; TODO confirm against the site.
            tempUrl = url + '&s=' + str(44 * i)
            html = getHTMLText(tempUrl)
            parserHTML(infoList, html)
        except Exception:
            # Best effort: a failure on one page must not stop the others.
            # Exception (not a bare except) lets Ctrl-C abort the crawl.
            continue
    printInfo(infoList)
# Run the scraper: fetch the search pages and print the collected table.
main()
2323

被折叠的 条评论
为什么被折叠?



