import json
import time
from datetime import datetime, timezone
from urllib import request

import pandas as pd
from bs4 import BeautifulSoup
# HTTP headers sent with every request; the User-Agent string is that of a
# desktop Chrome browser.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}
# To log in with a cookie, add a "Cookie" entry to the request headers (head).
# For example:
"""header = {"Content-Type":"application/json",
"Host":"ts.21cn.com",
"Connection": "keep-alive",
"Accept":" */*",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Referer": "https://ts.21cn.com/merchant/show/id/7037",
"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
"Cookie":"<your cookie string: press F12, then Network > XHR > Cookie>" }
"""
# URLs of the ranking-list API pages to download, one per page number.
# The merchant id (7037), list type and offset token are fixed query values;
# only pageNo varies.
JTS_PAGE_COUNT = 10
JTS_LIST_URL = (
    "https://ts.21cn.com/front/api/ranking/merchantPostList.do"
    "?pageNo={page}&merchantId=7037&listType=1&offset=b911ff99c1c81c99"
)
# Pages are numbered from 1, so the range runs 1..JTS_PAGE_COUNT inclusive.
jts_url_list = [
    JTS_LIST_URL.format(page=page) for page in range(1, JTS_PAGE_COUNT + 1)
]
# Download every page in `jts_url_list` (sending the `head` request headers)
# and flatten the posts found under each response's 'postList' key into a
# single DataFrame named `output`, one row per post.
JTS_COLUMNS = ['ctime', 'id', 'picture', 'title', 'shortTopic', 'shuqiu', 'tail_url']
jts_rows = []
for jts_url in jts_url_list:
    jts_req = request.Request(url=jts_url, headers=head)
    # The context manager closes the HTTP response once the body is read.
    with request.urlopen(jts_req) as jts_respon:
        jts_html = jts_respon.read().decode('utf-8', 'ignore')
    # Parse the body as JSON instead of eval(): eval would execute whatever
    # text the server returns and raises NameError on true/false/null.
    # NOTE(review): assumes the endpoint always answers with a JSON object.
    jts_dict = json.loads(jts_html)
    # Walk the posts actually returned rather than assuming exactly ten per
    # page; a missing or empty 'postList' simply contributes no rows.
    for jts_post in jts_dict.get('postList') or []:
        jts_rows.append({
            'ctime': jts_post.get('ctime'),
            'id': jts_post.get('id'),
            'picture': jts_post.get('picture'),
            'title': jts_post.get('title'),
            'shortTopic': jts_post.get('shortTopic'),
            'shuqiu': jts_post.get('shuqiu'),
            # Detail-page link derived from the post id.
            'tail_url': "https://ts.21cn.com/tousu/show/id/" + str(jts_post.get('id')),
        })
    # Pause two seconds before requesting the next page.
    time.sleep(2)
    print(str(jts_url)+"完成")
# Build the table once from all collected rows; the explicit column list keeps
# the column order stable and yields the same columns even with zero rows.
output = pd.DataFrame(jts_rows, columns=JTS_COLUMNS)
def stamp_to_datetime(stamp):
    """
    Convert a Unix timestamp in seconds to a naive UTC ``datetime``.

    Example: ``1539100800`` becomes ``datetime(2018, 10, 9, 16, 0, 0)``.

    :param stamp: seconds since the Unix epoch (int or float).
    :return: timezone-naive ``datetime`` holding the UTC wall-clock time,
        truncated to whole seconds.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12, so convert
    # through an aware UTC value instead.
    utc_moment = datetime.fromtimestamp(stamp, tz=timezone.utc)
    # Drop tzinfo to keep the historical naive return type, and zero the
    # microseconds, which the previous format/parse round-trip discarded.
    return utc_moment.replace(tzinfo=None, microsecond=0)
# Replace the raw epoch values in the `ctime` column with datetimes, then
# write the table to an Excel workbook without the index column.
output['ctime'] = output['ctime'].apply(stamp_to_datetime)
output.to_excel(r'爬虫结果.xlsx', index=False)