简单爬虫

基于 BeautifulSoup

import time
import requests
import json
import random
from bs4 import BeautifulSoup

def get_one_page(url, cook, timeout=10):
    """Fetch *url* with browser-like headers and return the response body.

    Args:
        url: Address to request with an HTTP GET.
        cook: Value sent verbatim as the ``cookie`` request header.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument. Without it a stalled connection would block forever.

    Returns:
        The response text when the status code is 200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged when the
            connection fails or the timeout expires.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'Keep-Alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'cookie': cook,
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None

# Cookie header value passed to get_one_page for every request.
# Each trailing backslash inside the triple-quoted literal is a line
# continuation, so the value is one single-line string with no embedded
# newlines.
# NOTE(review): this looks like a cookie copied from a browser session and is
# hard-coded here -- presumably it expires; confirm before reuse.
cooks = '''__mta=150305417.1588808901897.1595037249421.1595037405797.23;\
mojo-uuid=0f659c89788a56ae3a4164a8ab4f079c; t_lxid=171ec63e3b4c4-0e209e\
3a78b689-79657361-144000-171ec63e3b5c8-tid; _lxsdk_cuid=171ed587190c8-0\
3e96dce81b705-79657361-144000-171ed587190c8-tid; uuid_n_v=v1; uuid=81D1\
E730C89911EAA7A6E9BC9532945D723B984CDF0747D1AE88870BCF7E01E8; _csrf=cdad\
fe6eda6f526f7ac27336a69c0a0c2a174f5cbe1f79308d13badb010cb358; Hm_lvt_70\
3e94591e87be68cc8da0da7cbd0be2=1595037223; _lx_utm=utm_source%3DBaidu%2\
6utm_medium%3Dorganic; _lxsdk=81D1E730C89911EAA7A6E9BC9532945D723B984CD\
F0747D1AE88870BCF7E01E8; __mta=150305417.1588808901897.1588842614861.15\
95037226252.20; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1595037405; _\
lxsdk_s=173602ee478-64f-d26-79b%7C%7C1'''
# Walk the ranking pages at offsets 0, 10, ..., 90. For each page, append the
# raw HTML to 001.txt and one comma-separated line per <dd> entry to 111.txt.
for i in range(0, 100, 10):
    url = "https://maoyan.com/board/4?offset=" + str(i)
    # Random 2-5 second pause before each request (presumably to throttle
    # the crawl); the chosen delay and the URL are echoed for progress.
    times = random.randint(2, 5)
    time.sleep(times)
    print(times)
    print(url)
    html = get_one_page(url, cooks)
    if html is None:
        # get_one_page returns None for any non-200 response; writing or
        # parsing None would raise TypeError, so skip this page instead.
        print("skip (no HTML returned): " + url)
        continue
    with open("C:\\Users\\西木康\\Desktop\\爬虫\\001.txt", "a", encoding='utf-8') as f:
        f.write(html)
    soop = BeautifulSoup(html, 'lxml')
    rows = []
    for text2 in soop.find_all('dd'):
        # Fields in output order: rank (<i>), name, stars, release time.
        fields = (
            text2.i,
            text2.find("p", class_="name"),
            text2.find("p", class_="star"),
            text2.find("p", class_="releasetime"),
        )
        # A missing tag, or a tag whose .string is None, would make .strip()
        # raise AttributeError; skip such entries rather than abort the crawl.
        if any(tag is None or tag.string is None for tag in fields):
            continue
        rows.append(",".join(tag.string.strip() for tag in fields) + "\n")
    if rows:
        # Open the output file once per page instead of once per row.
        with open("C:\\Users\\西木康\\Desktop\\爬虫\\111.txt", "a", encoding='utf-8') as f:
            f.writelines(rows)
print("OK!")

结果

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值