# Based on BeautifulSoup
import time
import requests
import json
import random
from bs4 import BeautifulSoup
def get_one_page(url, cook):
    """Fetch one page of the Maoyan Top-100 board.

    Parameters
    ----------
    url : str
        Full page URL, e.g. "https://maoyan.com/board/4?offset=10".
    cook : str
        Raw cookie header string; Maoyan serves a verification page to
        cookie-less clients, so a real browser cookie is required.

    Returns
    -------
    str or None
        The HTML body on HTTP 200, otherwise None.
    """
    headers = {
        # Desktop UA so the site serves the normal HTML layout.
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'Keep-Alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'cookie': cook,
    }
    # timeout added: without it a stalled connection would hang the crawl forever
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return None
# Session cookies copied verbatim from a logged-in browser session; Maoyan
# blocks anonymous scrapers, so this value must be refreshed once it expires.
# NOTE: the backslashes are escapes INSIDE the triple-quoted string, so the
# whole value is a single long line with no embedded newlines.
cooks = '''__mta=150305417.1588808901897.1595037249421.1595037405797.23;\
mojo-uuid=0f659c89788a56ae3a4164a8ab4f079c; t_lxid=171ec63e3b4c4-0e209e\
3a78b689-79657361-144000-171ec63e3b5c8-tid; _lxsdk_cuid=171ed587190c8-0\
3e96dce81b705-79657361-144000-171ed587190c8-tid; uuid_n_v=v1; uuid=81D1\
E730C89911EAA7A6E9BC9532945D723B984CDF0747D1AE88870BCF7E01E8; _csrf=cdad\
fe6eda6f526f7ac27336a69c0a0c2a174f5cbe1f79308d13badb010cb358; Hm_lvt_70\
3e94591e87be68cc8da0da7cbd0be2=1595037223; _lx_utm=utm_source%3DBaidu%2\
6utm_medium%3Dorganic; _lxsdk=81D1E730C89911EAA7A6E9BC9532945D723B984CD\
F0747D1AE88870BCF7E01E8; __mta=150305417.1588808901897.1588842614861.15\
95037226252.20; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1595037405; _\
lxsdk_s=173602ee478-64f-d26-79b%7C%7C1'''
# Crawl the Top-100 board: 10 films per page, offsets 0, 10, ..., 90.
for i in range(0, 100, 10):
    url = "https://maoyan.com/board/4?offset=" + str(i)
    # Random 2-5 s pause between requests to stay under the rate limit.
    times = random.randint(2, 5)
    time.sleep(times)
    print(times)
    print(url)
    html = get_one_page(url, cooks)
    if html is None:
        # Non-200 response (expired cookie / blocked); skip instead of
        # crashing on f.write(None) as the original code did.
        print(url)
        continue
    # Keep a raw-HTML dump for debugging / re-parsing offline.
    with open("C:\\Users\\西木康\\Desktop\\爬虫\\001.txt", "a", encoding='utf-8') as f:
        f.write(html)
    soop = BeautifulSoup(html, 'lxml')
    # Each <dd> on the board page holds one film entry.
    for text2 in soop.find_all('dd'):
        index = text2.i.string                              # rank number
        name = text2.find("p", class_="name").string        # film title
        stars = text2.find("p", class_="star").string       # cast list
        risetime = text2.find("p", class_="releasetime").string  # release date
        # Tag.string is None when a tag has mixed children; guard before
        # strip() so one odd entry cannot abort the whole crawl.
        fields = [(s or "").strip() for s in (index, name, stars, risetime)]
        text3 = ",".join(fields) + "\n"
        with open("C:\\Users\\西木康\\Desktop\\爬虫\\111.txt", "a", encoding='utf-8') as f:
            f.write(text3)
    print("OK!")
# Results are appended to 001.txt (raw HTML) and 111.txt (CSV rows).
