Scraping Toutiao

The page is loaded via Ajax, so the article list cannot be scraped from the static HTML directly. Instead, the script below calls the search API endpoint that the page itself requests and parses the JSON it returns.
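For reference, the request that get_page() below assembles looks like this (the parameter values are the ones hard-coded in the script; the set of parameters Toutiao expects may change over time):

https://www.toutiao.com/api/search/content/?aid=24&offset=0&format=json&keyword=%E5%9B%BE%E9%9B%86+%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab&pd=synthesis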

import urllib.request
import urllib.parse
import urllib.error
import json
from bs4 import BeautifulSoup
import re
import os
from multiprocessing import Pool

def get_page(offset):
    # Query parameters for Toutiao's search API; 'keyword' is the search
    # term ("photo gallery" + "street snap").
    data = {
        'aid': '24',
        'offset': offset,
        'format': 'json',
        'keyword': '图集 街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urllib.parse.urlencode(data)
    try:
        response = urllib.request.urlopen(url)
        if response.status == 200:
            # Return the decoded body so callers never handle a raw response.
            return response.read().decode('utf-8')
        return None
    except urllib.error.URLError:
        print('Request failed:', url)
        return None
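# The API responds with JSON roughly of this shape (simplified; the field
# names are the ones json_parse() below reads, the real payload has more):
#
# {
#     "data": [
#         {"article_url": "https://www.toutiao.com/...", ...},
#         ...
#     ]
# }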
     
def json_parse(html):
    # Pull the article URLs out of the API's JSON response.
    data = json.loads(html)
    if data and 'data' in data:
        for each in data['data'] or []:
            yield each.get('article_url')
               
def url_open(url):
    # Download raw bytes (used below to save the image files).
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36')
    html = urllib.request.urlopen(req)
    return html.read()

def get_page_detail(url):
    # Fetch an article page; some servers reject urllib's default
    # User-Agent, so send a browser one.
    try:
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36')
        response = urllib.request.urlopen(req)
        if response.status == 200:
            return response.read()
        return None
    except urllib.error.URLError:
        print('Failed to fetch url:', url)
        return None
     
def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    title_tag = soup.find('title')
    title = title_tag.string if title_tag else None
    if not title:
        return
    # The gallery data is embedded in an inline script as
    # gallery: JSON.parse("..."), so extract the escaped JSON string.
    # Match against the raw HTML: prettify() would insert whitespace
    # inside the captured string and break json.loads below.
    pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
    result = re.findall(pattern, html)
    if result:
        # Crude unescape: dropping the backslashes turns \" back into "
        # so the string parses as JSON.
        gallery = json.loads(result[0].replace('\\', ''))
        urls = [each.get('url') for each in gallery.get('sub_images') or []]
        # Strip characters that are illegal in file names.
        safe_title = re.sub(r'[\\/:*?"<>|]', '', title)
        for num, image_url in enumerate(urls, start=1):
            filename = '{}{}.jpg'.format(safe_title, num)
            with open(filename, 'wb') as f:
                f.write(url_open(image_url))
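
# parse_page() assumes the article HTML embeds its gallery as an inline
# script fragment roughly like the following (inferred from the regex above;
# the exact markup may differ from page to page):
#
#     gallery: JSON.parse("{\"sub_images\": [{\"url\": \"http...\"}, ...]}"),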

     
def main(offset):
    html = get_page(offset)
    if not html:
        return
    for url in json_parse(html):
        if not url:
            continue
        # Fetch each article once and reuse the result.
        detail = get_page_detail(url)
        if detail:
            parse_page(detail.decode('utf-8'))
          
if __name__ == '__main__':
    # Save everything into a 图片 ("images") directory.
    os.makedirs('图片', exist_ok=True)
    os.chdir('图片')
    # Offsets 0, 20, ..., 80: five pages of 20 results each.
    groups = [x * 20 for x in range(5)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
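
Running the script creates a 图片 ("images") directory and downloads five pages of search results (offsets 0 through 80) in parallel, one pool worker per offset. When debugging, it is simpler to bypass the pool and call main(0) directly so errors surface in the main process.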

 
