import re,requests,os,json,time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# --- one-time setup: load HTTP headers from disk, configure the browser ---
# NOTE(review): PhantomJS support is deprecated in Selenium — consider
# headless Chrome/Firefox.  Paths are hard-coded to a Windows D: drive.
dcap = dict(DesiredCapabilities.PHANTOMJS)
h = {}
with open('d://headers1.txt', 'r', errors='ignore') as f:
    # File format: one "Header-Name: value" per line.
    for line in f.read().split('\n'):
        # Skip blank/invalid lines — split('\n') always yields a trailing ''
        # which made the original `line.split(':', 1)` raise ValueError.
        if ':' not in line:
            continue
        name, v = line.split(':', 1)
        h[name] = v
dcap["phantomjs.page.settings"] = h
driver = webdriver.PhantomJS()
url = 'https://list.tmall.com/search_product.htm?q=手机'
total_path = 'd://淘宝'
if not os.path.isdir(total_path):
    os.mkdir(total_path)
def get(url):
    """Download *url* with the module-level headers and return the raw bytes.

    A fresh requests.Session is created per call; the caller is responsible
    for decoding the returned bytes.
    """
    print(url)
    s = requests.Session()
    # timeout keeps the scraper from hanging forever on a dead connection
    # (the original had none); also drops the unused `headers = h` local.
    data = s.get(url, headers=h, timeout=30).content
    return data
def parser(url):
    """Scrape every product card on a Tmall search-result page.

    Downloads *url*, extracts each "product" div, builds a shop_info dict
    whose keys depend on how many non-empty text fields the card exposes
    (3, 4 or 5), then delegates per-item scraping to get_shop().
    Cards without a usable link or item id are skipped.
    """
    # errors='ignore' so one bad byte in the GBK page doesn't abort the run
    data = get(url).decode('gbk', errors='ignore')
    bs = BeautifulSoup(data, 'lxml')
    items = bs.find_all("div", "product")
    for n, item in enumerate(items, 1):
        print(n)
        shop_info = {}
        info = [x for x in item.text.split('\n') if x]
        if len(info) == 4:
            shop_info['price'], shop_info['name'], shop_info['shop_name'], shop_info['月成交笔数'] = info
        elif len(info) == 3:
            shop_info['price'], shop_info['name'], shop_info['shop_name'] = info
        elif len(info) == 5:
            shop_info['price'], shop_info['name'], shop_info['two_name'], shop_info['shop_name'], shop_info['月成交笔数'] = info
        else:
            continue
        # Explicit None checks replace the original bare `except:` which
        # silently swallowed every error, not just missing elements.
        title_div = item.find("div", "productTitle productTitle-spu")
        if title_div is None or title_div.a is None or 'href' not in title_div.a.attrs:
            continue
        shop_link = 'https:' + title_div.a['href']
        shop_info['链接'] = shop_link
        # Pull the numeric item id out of the query string; the original
        # indexed findall()[0] and raised IndexError when no '&' followed.
        m = re.search(r"\?id=(.*?)&", shop_link)
        if m is None:
            continue
        shop_info['itemid'] = int(m.group(1))
        item_path = total_path + '//' + str(shop_info['itemid'])
        if os.path.isdir(item_path):
            print('已存在')
            continue
        get_shop(shop_info, shop_link, item_path)
        print('______下一个物品——————')
def save(info, item_path):
    """Append *info* (a dict) as text to <item_path>/<sanitized name>.txt.

    Creates *item_path* (including parents) if it does not exist.  Non-dict
    *info* is silently ignored, preserving the original best-effort behaviour.
    """
    if not os.path.isdir(item_path):
        # makedirs: also creates missing parents (mkdir raised FileNotFoundError)
        os.makedirs(item_path)
    if not isinstance(info, dict):
        return
    # Strip characters that are illegal in Windows file names.
    title = re.sub(r'[/\\*><?:"|]', '', str(info['name']))
    path = item_path + '//' + title + '.txt'
    # encoding pinned to utf-8 so the output no longer depends on the
    # locale default (the original open() used the platform encoding).
    with open(path, 'a', encoding='utf-8', errors='replace') as w:
        w.write(str(info))
        print('写入')
def get_shop(shop_info, shop_link, item_path):
    """Render *shop_link* in the PhantomJS driver, enrich *shop_info* with
    monthly sales and review count, save it, then scrape the review pages.
    """
    driver.get(shop_link)
    print(shop_link)
    bs = BeautifulSoup(driver.page_source, 'lxml')
    bs = bs.find("div", "tb-wrap")
    shop_info['月销量'] = bs.find("div", "tm-indcon").text
    shop_info['累计评价'] = bs.find_all("div", "tm-indcon")[1].text
    save(shop_info, item_path)
    print('——————已保存物品信息________')
    # First 4 chars are the label ("累计评价"); the remainder is the count.
    # 10 reviews per page -> number of rate pages to fetch.
    num = int(shop_info['累计评价'][4:]) // 10
    # BUGFIX: the original URL contained the mojibake "¤tPage" — the HTML
    # entity "&curren" swallowed "&current", so the page parameter was never
    # sent.  Restored to "&currentPage".
    # NOTE(review): spuId/sellerId are hard-coded and belong to one specific
    # shop — almost certainly wrong for other items; confirm against the site.
    p_links = [
        'https://rate.tmall.com/list_detail_rate.htm?itemId={0}&spuId=566150620'
        '&sellerId=2616970884&order=3&currentPage={1}&append=0&content=1'
        '&tagId=&posi=&picture='.format(shop_info['itemid'], i)
        for i in range(1, num)
    ]
    for p_link in p_links:
        try:
            item_p = {'name': '评论'}
            data = get(p_link).decode('gbk')
            # Response is JSONP-ish; slice out the outermost {...} object.
            start = data.find('{')
            end = data.rfind('}')
            p = json.loads(data[start:end + 1])
            for rate in p['rateList']:
                item_p['评论时间'] = rate['rateDate']
                item_p['物品类型'] = rate['auctionSku']
                item_p['评论'] = rate['rateContent']
                save(item_p, item_path)
                print('-----评论------')
        except Exception:
            # best-effort: skip pages that fail to download or parse
            # (narrowed from the original bare except; dead locals
            # p_info/list_p removed)
            continue
if __name__ == '__main__':
    # Guard the entry point so importing this module does not start a scrape.
    parser(url)
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# --- one-time setup: load HTTP headers from disk, configure the browser ---
# NOTE(review): this entire section (and the definitions that follow) is a
# verbatim duplicate of the top of the file — almost certainly an accidental
# paste; consider removing one copy.
# NOTE(review): PhantomJS support is deprecated in Selenium — consider
# headless Chrome/Firefox.  Paths are hard-coded to a Windows D: drive.
dcap = dict(DesiredCapabilities.PHANTOMJS)
h = {}
with open('d://headers1.txt', 'r', errors='ignore') as f:
    # File format: one "Header-Name: value" per line.
    for line in f.read().split('\n'):
        # Skip blank/invalid lines — split('\n') always yields a trailing ''
        # which made the original `line.split(':', 1)` raise ValueError.
        if ':' not in line:
            continue
        name, v = line.split(':', 1)
        h[name] = v
dcap["phantomjs.page.settings"] = h
driver = webdriver.PhantomJS()
url = 'https://list.tmall.com/search_product.htm?q=手机'
total_path = 'd://淘宝'
if not os.path.isdir(total_path):
    os.mkdir(total_path)
def get(url):
    """Download *url* with the module-level headers and return the raw bytes.

    A fresh requests.Session is created per call; the caller is responsible
    for decoding the returned bytes.
    """
    print(url)
    s = requests.Session()
    # timeout keeps the scraper from hanging forever on a dead connection
    # (the original had none); also drops the unused `headers = h` local.
    data = s.get(url, headers=h, timeout=30).content
    return data
def parser(url):
    """Scrape every product card on a Tmall search-result page.

    Downloads *url*, extracts each "product" div, builds a shop_info dict
    whose keys depend on how many non-empty text fields the card exposes
    (3, 4 or 5), then delegates per-item scraping to get_shop().
    Cards without a usable link or item id are skipped.
    """
    # errors='ignore' so one bad byte in the GBK page doesn't abort the run
    data = get(url).decode('gbk', errors='ignore')
    bs = BeautifulSoup(data, 'lxml')
    items = bs.find_all("div", "product")
    for n, item in enumerate(items, 1):
        print(n)
        shop_info = {}
        info = [x for x in item.text.split('\n') if x]
        if len(info) == 4:
            shop_info['price'], shop_info['name'], shop_info['shop_name'], shop_info['月成交笔数'] = info
        elif len(info) == 3:
            shop_info['price'], shop_info['name'], shop_info['shop_name'] = info
        elif len(info) == 5:
            shop_info['price'], shop_info['name'], shop_info['two_name'], shop_info['shop_name'], shop_info['月成交笔数'] = info
        else:
            continue
        # Explicit None checks replace the original bare `except:` which
        # silently swallowed every error, not just missing elements.
        title_div = item.find("div", "productTitle productTitle-spu")
        if title_div is None or title_div.a is None or 'href' not in title_div.a.attrs:
            continue
        shop_link = 'https:' + title_div.a['href']
        shop_info['链接'] = shop_link
        # Pull the numeric item id out of the query string; the original
        # indexed findall()[0] and raised IndexError when no '&' followed.
        m = re.search(r"\?id=(.*?)&", shop_link)
        if m is None:
            continue
        shop_info['itemid'] = int(m.group(1))
        item_path = total_path + '//' + str(shop_info['itemid'])
        if os.path.isdir(item_path):
            print('已存在')
            continue
        get_shop(shop_info, shop_link, item_path)
        print('______下一个物品——————')
def save(info, item_path):
    """Append *info* (a dict) as text to <item_path>/<sanitized name>.txt.

    Creates *item_path* (including parents) if it does not exist.  Non-dict
    *info* is silently ignored, preserving the original best-effort behaviour.
    """
    if not os.path.isdir(item_path):
        # makedirs: also creates missing parents (mkdir raised FileNotFoundError)
        os.makedirs(item_path)
    if not isinstance(info, dict):
        return
    # Strip characters that are illegal in Windows file names.
    title = re.sub(r'[/\\*><?:"|]', '', str(info['name']))
    path = item_path + '//' + title + '.txt'
    # encoding pinned to utf-8 so the output no longer depends on the
    # locale default (the original open() used the platform encoding).
    with open(path, 'a', encoding='utf-8', errors='replace') as w:
        w.write(str(info))
        print('写入')
def get_shop(shop_info, shop_link, item_path):
    """Render *shop_link* in the PhantomJS driver, enrich *shop_info* with
    monthly sales and review count, save it, then scrape the review pages.
    """
    driver.get(shop_link)
    print(shop_link)
    bs = BeautifulSoup(driver.page_source, 'lxml')
    bs = bs.find("div", "tb-wrap")
    shop_info['月销量'] = bs.find("div", "tm-indcon").text
    shop_info['累计评价'] = bs.find_all("div", "tm-indcon")[1].text
    save(shop_info, item_path)
    print('——————已保存物品信息________')
    # First 4 chars are the label ("累计评价"); the remainder is the count.
    # 10 reviews per page -> number of rate pages to fetch.
    num = int(shop_info['累计评价'][4:]) // 10
    # BUGFIX: the original URL contained the mojibake "¤tPage" — the HTML
    # entity "&curren" swallowed "&current", so the page parameter was never
    # sent.  Restored to "&currentPage".
    # NOTE(review): spuId/sellerId are hard-coded and belong to one specific
    # shop — almost certainly wrong for other items; confirm against the site.
    p_links = [
        'https://rate.tmall.com/list_detail_rate.htm?itemId={0}&spuId=566150620'
        '&sellerId=2616970884&order=3&currentPage={1}&append=0&content=1'
        '&tagId=&posi=&picture='.format(shop_info['itemid'], i)
        for i in range(1, num)
    ]
    for p_link in p_links:
        try:
            item_p = {'name': '评论'}
            data = get(p_link).decode('gbk')
            # Response is JSONP-ish; slice out the outermost {...} object.
            start = data.find('{')
            end = data.rfind('}')
            p = json.loads(data[start:end + 1])
            for rate in p['rateList']:
                item_p['评论时间'] = rate['rateDate']
                item_p['物品类型'] = rate['auctionSku']
                item_p['评论'] = rate['rateContent']
                save(item_p, item_path)
                print('-----评论------')
        except Exception:
            # best-effort: skip pages that fail to download or parse
            # (narrowed from the original bare except; dead locals
            # p_info/list_p removed)
            continue
if __name__ == '__main__':
    # Guard the entry point so importing this module does not start a scrape.
    parser(url)