#!/usr/bin/env python2
#-*- coding: UTF-8 -*-
# File: single_html_parser.py
# Date: 2019-5-8
# Author: guoxinian
import sys
from bs4 import BeautifulSoup
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import sys
import json
reload(sys)
# Python 2 only: force the default str<->unicode codec to UTF-8 so the
# Chinese episode titles round-trip without explicit encode/decode calls.
sys.setdefaultencoding('utf-8')
# Module-level output handle: accepted titles are appended one per line.
writer = open('./data/tt2.txt','w')
# Module-level input handle: candidate animation titles, one per line.
reader =open('./data/donghua_filter.txt','r')
# Per-request timeout in seconds, used by get_html().
request_time_limit=500
def get_html(url):
    """Fetch *url* and return the page body as UTF-8 encoded text, or None.

    Sends a Baiduspider User-Agent so the site serves a crawler-friendly
    page.  SSL certificate verification is disabled (the corresponding
    urllib3 warning is suppressed) and the request is bounded by the
    module-level `request_time_limit` (seconds).
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"}
    try:
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        req = requests.get(url, headers=headers, verify=False, timeout=request_time_limit)
        if req.encoding == 'ISO-8859-1':
            # requests falls back to ISO-8859-1 when the server omits a
            # charset header; prefer the charset declared inside the page,
            # then requests' content-based guess.
            encodings = requests.utils.get_encodings_from_content(req.text)
            if encodings:
                encoding = encodings[0]
            else:
                encoding = req.apparent_encoding
        else:
            encoding = req.encoding
        html = req.content.decode(encoding, 'ignore').encode('utf-8', 'ignore')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still abort the crawl instead of being silently swallowed.
        print("Download error:", url)
        html = None
    return html
def parser_html(url,title):
    """Fetch the iQiyi search page at *url* and decide whether *title*
    corresponds to an album-style ("box") result; accepted titles are
    appended to the module-level `writer`.

    A title is accepted when the first search hit either lists more than
    five sequentially numbered episode links (第1集, 第2集, ...) or
    contains a `mod-discernList` block.
    """
    html = get_html(url)
    if not html:
        print(url, " html not exist")
        return
    try:
        # Walk down the fixed structure of the first search result.
        soup = BeautifulSoup(html, 'html.parser')
        div1 = soup.find('div', attrs={'class': 'mod_result'})
        ul1 = div1.find('ul', attrs={'class': 'mod_result_list'})
        li1 = ul1.find('li', attrs={'class': 'list_item'})
        div2 = li1.find('div', attrs={'class': 'result_info result_info-auto result_info-180236'})
        div3 = div2.find('div', attrs={'data-tvlist-elem': 'alllist'})
        ul = div3.find('ul', attrs={'class': 'result_album clearfix'})
        lis = ul.find_all('li', attrs={'class': 'album_item'})
        # Count consecutive episode links titled 第1集, 第2集, ... — the
        # counter only advances while the numbering stays sequential.
        number = 1
        for li in lis:
            a = li.find('a')
            tt = a['title']
            if tt == ("第" + str(number) + "集"):
                number += 1
        flag = False
        if number > 5:
            flag = True
            print(title)
        # A discern-list block on the result also marks an album.
        div4 = li1.find('div', attrs={'class': 'mod-discernList mb10 mt10'})
        if div4:
            flag = True
            print(title)
        if flag:
            writer.write(title.strip().strip("\n"))
            writer.write("\n")
    except Exception:
        # Any missing node in the expected page layout lands here (an
        # AttributeError on None).  Narrowed from a bare `except:` so
        # KeyboardInterrupt still interrupts the crawl.
        print('parser error')
if __name__ == '__main__':
    # One candidate title per input line.
    titles = reader.readlines()
    for title in titles:
        # Strip the trailing newline before building the URL — the
        # original embedded the raw readline() result, newline included,
        # which produced malformed request URLs.
        title = title.strip()
        if not title:
            continue  # skip blank lines instead of issuing an empty query
        url = "https://so.iqiyi.com/so/q_" + title + "?source=input&sr=623565566409"
        parser_html(url, title)
    # Flush and release the module-level file handles.
    writer.close()
    reader.close()
# Uses iQiyi's site-wide search: for each query title, decide whether the
# search results form a "box" (album/episode) collection.