# Based on BeautifulSoup
import time
import requests
import json
import random
from bs4 import BeautifulSoup
def get_one_page(url, cook):
    """Fetch one page of the Maoyan Top-100 board.

    Parameters
    ----------
    url : str
        Full page URL, e.g. "https://maoyan.com/board/4?offset=10".
    cook : str
        Raw cookie header string; Maoyan serves a verification page to
        cookie-less clients, so a real browser cookie is required.

    Returns
    -------
    str or None
        The HTML body on HTTP 200, otherwise None.
    """
    headers = {
        # Desktop UA so the site serves the normal HTML layout.
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'Keep-Alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'cookie': cook,
    }
    # timeout added: without it a stalled connection would hang the crawl forever
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return None
# Session cookies copied verbatim from a logged-in browser session; Maoyan
# blocks anonymous scrapers, so this value must be refreshed once it expires.
# NOTE: the backslashes are escapes INSIDE the triple-quoted string, so the
# whole value is a single long line with no embedded newlines.
cooks = '''__mta=150305417.1588808901897.1595037249421.1595037405797.23;\
mojo-uuid=0f659c89788a56ae3a4164a8ab4f079c; t_lxid=171ec63e3b4c4-0e209e\
3a78b689-79657361-144000-171ec63e3b5c8-tid; _lxsdk_cuid=171ed587190c8-0\
3e96dce81b705-79657361-144000-171ed587190c8-tid; uuid_n_v=v1; uuid=81D1\
E730C89911EAA7A6E9BC9532945D723B984CDF0747D1AE88870BCF7E01E8; _csrf=cdad\
fe6eda6f526f7ac27336a69c0a0c2a174f5cbe1f79308d13badb010cb358; Hm_lvt_70\
3e94591e87be68cc8da0da7cbd0be2=1595037223; _lx_utm=utm_source%3DBaidu%2\
6utm_medium%3Dorganic; _lxsdk=81D1E730C89911EAA7A6E9BC9532945D723B984CD\
F0747D1AE88870BCF7E01E8; __mta=150305417.1588808901897.1588842614861.15\
95037226252.20; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1595037405; _\
lxsdk_s=173602ee478-64f-d26-79b%7C%7C1'''
# Crawl the Top-100 board: 10 films per page, offsets 0, 10, ..., 90.
for i in range(0, 100, 10):
    url = "https://maoyan.com/board/4?offset=" + str(i)
    # Random 2-5 s pause between requests to stay under the rate limit.
    times = random.randint(2, 5)
    time.sleep(times)
    print(times)
    print(url)
    html = get_one_page(url, cooks)
    if html is None:
        # Non-200 response (expired cookie / blocked); skip instead of
        # crashing on f.write(None) as the original code did.
        print(url)
        continue
    # Keep a raw-HTML dump for debugging / re-parsing offline.
    with open("C:\\Users\\西木康\\Desktop\\爬虫\\001.txt", "a", encoding='utf-8') as f:
        f.write(html)
    soop = BeautifulSoup(html, 'lxml')
    # Each <dd> on the board page holds one film entry.
    for text2 in soop.find_all('dd'):
        index = text2.i.string                              # rank number
        name = text2.find("p", class_="name").string        # film title
        stars = text2.find("p", class_="star").string       # cast list
        risetime = text2.find("p", class_="releasetime").string  # release date
        # Tag.string is None when a tag has mixed children; guard before
        # strip() so one odd entry cannot abort the whole crawl.
        fields = [(s or "").strip() for s in (index, name, stars, risetime)]
        text3 = ",".join(fields) + "\n"
        with open("C:\\Users\\西木康\\Desktop\\爬虫\\111.txt", "a", encoding='utf-8') as f:
            f.write(text3)
    print("OK!")
# Results are appended to 001.txt (raw HTML) and 111.txt (CSV rows).
