# 网页为Ajax加载, 不能直接爬。(Page content is Ajax-loaded; it cannot be scraped from the static HTML directly.)
import urllib.request
import urllib.parse
import urllib.error
import json
from bs4 import BeautifulSoup
import re
import os
from multiprocessing import Pool
def get_page(offset):
    """Fetch one page of Toutiao search results from the Ajax JSON endpoint.

    Args:
        offset: Pagination offset passed to the search API (multiples of 20).

    Returns:
        The open HTTP response object on success (the caller reads it),
        or None on a non-200 status or a request failure.
    """
    data = {
        'aid': '24',
        'offset': offset,
        'format': 'json',
        'keyword': '图集 街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urllib.parse.urlencode(data)
    # Send a browser User-Agent, consistent with every other request in this
    # file; the API may reject urllib's default UA.
    req = urllib.request.Request(url)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36')
    try:
        response = urllib.request.urlopen(req)
        if response.code == 200:
            return response
        return None
    except urllib.error.URLError:
        # URLError is the base of HTTPError, so this also covers DNS /
        # connection failures, not just HTTP error statuses.
        print('请求失败')
        return None
def json_parse(html):
    """Yield every 'article_url' found in a JSON search-result payload.

    Args:
        html: JSON text as returned by the search API.

    Yields:
        The 'article_url' value of each entry under the 'data' key
        (may yield None for entries lacking that key).
    """
    payload = json.loads(html)
    if not payload or 'data' not in payload:
        return
    for entry in payload.get('data'):
        yield entry.get('article_url')
def url_open(url):
    """Download *url* with a browser User-Agent and return the raw bytes.

    Args:
        url: Absolute URL of the resource (an image, in this script).

    Returns:
        The response body as bytes.
    """
    req = urllib.request.Request(url)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36')
    # Close the connection deterministically instead of leaking it until GC.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_page_detail(url):
    """Fetch an article page and return its body as bytes.

    Args:
        url: Article URL taken from the search results.

    Returns:
        Response body bytes on HTTP 200, otherwise None (including on
        any network/HTTP failure).
    """
    req = urllib.request.Request(url)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36')
    try:
        # Narrowed from a bare `except:` — URLError covers HTTPError and
        # connection-level failures without swallowing KeyboardInterrupt etc.
        with urllib.request.urlopen(req) as response:
            if response.code == 200:
                return response.read()
            return None
    except urllib.error.URLError:
        print('请求url失败:', url)
        return None
def parse_page(html):
    """Extract the gallery image URLs embedded in an article page and save
    each image to the current directory as '<title><n>.jpg'.

    The page inlines its gallery as ``gallery: JSON.parse("...")`` inside a
    <script> tag; the string is pulled out with a regex, the JS escape
    backslashes are stripped, and the remainder is parsed as JSON.
    NOTE(review): stripping ALL backslashes also destroys \\u escapes —
    works for plain URLs with escaped slashes, but is fragile.

    Args:
        html: Decoded HTML text of an article page.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Do not shadow the `html` parameter with the prettified markup.
    pretty = soup.prettify()
    title = soup.find('title').string
    if not title:
        return
    pattern = re.compile("gallery: JSON.parse\(\"(.*?)\"\),", re.S)
    matches = re.findall(pattern, pretty)
    if not matches:
        return
    gallery = json.loads(str(matches[0]).replace('\\', ''))
    image_urls = [each.get('url') for each in gallery.get('sub_images')]
    num = 1
    for image_url in image_urls:
        filname = str(title) + str(num) + '.jpg'
        with open(filname, 'wb') as f:
            f.write(url_open(image_url))
        num += 1
def main(offset):
    """Process one results page: fetch the listing, then download every
    article's gallery images.

    Args:
        offset: Pagination offset for the search API.
    """
    page = get_page(offset)
    if page is None:
        # get_page returns None on failure; the original crashed here
        # calling .read() on None.
        return
    listing = page.read()
    for url in json_parse(listing):
        if not url:
            continue
        # Fetch each article exactly once (the original downloaded every
        # article twice: once for the truthiness check, once for parsing).
        detail = get_page_detail(url)
        if detail:
            parse_page(detail.decode('utf-8'))
if __name__ == '__main__':
    # exist_ok so reruns don't crash with FileExistsError on '图片'.
    os.makedirs('图片', exist_ok=True)
    os.chdir('图片')
    # Offsets 0, 20, 40, 60, 80 — five pages of 20 results each.
    groups = [x * 20 for x in range(0, 5)]
    # Context manager terminates worker processes cleanly on exit.
    with Pool() as pool:
        pool.map(main, groups)
