爬取电影天堂全站电影

新博客:https://wywwzjj.top/

具体分析以后再补,静态页面也没啥好分析的。

import requests, re
from bs4 import BeautifulSoup
import xlsxwriter
import datetime
from lxml import etree

def get_URLs(URL, page):
    """Collect the absolute links listed on one paginated listing page.

    The page at ``URL + page + '.html'`` is downloaded through ``get_html``
    and parsed with BeautifulSoup. Every element carrying the CSS class
    ``ulink`` is selected, the ``href`` values are pulled out of the
    serialized markup with a regular expression, hrefs containing
    ``'index'`` are dropped, and the remaining ones are prefixed with the
    site domain.

    Args:
        URL: Listing URL prefix, up to but not including the page number.
        page: Page number appended to ``URL``. Usually a string; any value
            whose ``str()`` is the wanted number is accepted too.

    Returns:
        A list of absolute URLs (domain + href), in the order they appear
        on the page.
    """
    domain = 'http://www.ygdy8.net'
    # presumably get_html returns the page markup as text -- its body is
    # not fully visible here, confirm before relying on it.
    html = get_html(URL + str(page) + '.html')
    soup = BeautifulSoup(html, 'lxml')
    anchors = soup.find_all(class_='ulink')
    # The regex runs over the serialized tags, so it relies on href being
    # the last attribute before the closing '>' of each tag.
    hrefs = re.findall(r'href="(.+?)">', str(anchors))
    # Filter into a new list instead of popping from the list being
    # iterated: removing during iteration skips the element that follows
    # each removed one, so consecutive 'index' links used to slip through.
    # NOTE (original author): each page was observed to come up two
    # entries short; the cause is not established by this code.
    return [domain + href for href in hrefs if 'index' not in href]

def get_html(url):
     try:
         headers = {
   
   
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
             'ContentType': 'text/html; charset=utf-8'
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值