The crawler uses Python's requests and lxml libraries; both can be installed with pip install requests and pip install lxml. The lxml installation guide is at https://lxml.de/installation.html, and an alternative way to install lxml is apt-get install python-lxml.
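To check that both libraries are installed correctly, a minimal sketch like the one below should run without errors; it simply fetches the lxml homepage (used here only as an example page) and counts the links on it:

import requests
from lxml import html

resp = requests.get('https://lxml.de/')       # fetch any reachable page as a test
tree = html.fromstring(resp.text)             # parse the HTML with lxml
print(len(tree.xpath('//a')), 'links found')  # count all <a> elements on the page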
The corresponding code is:
# -*- coding: utf-8 -*-
import os
import requests
from lxml import html
headers = {
    'Host': 'docs.qed-it.com',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    # !!! note: even if the request header asks for gzip, the response body is not
    # necessarily compressed; that depends on whether the target site compresses its pages
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    #'Authorization': 'Basic cWVkdGVzdGVyOmc1djhLUlZjdXBwNA==',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
}
cookies = dict(Cookie='__cfduid=da58e11c538b3d7981894a4763b5bb4db1539678780; _ga=GA1.2.1749017230.1539678802; _gid=GA1.2.986082824.1540429323')
#store the crawled urls
crawled_urls = []
#save the content of a page to path/filename
def save(text, filename='temp', path='output'):
    setup_dir(path)
    fpath = os.path.join(path, filename)
    with open(fpath, 'w', encoding='utf-8') as f:
        print('output:', fpath)
        f.write(text)
#get the complete url, e.g. http://www.xxx.com/xxx/yyy.html
def fix_url(prefix_url, url):
    #cut off anything after the 'html' suffix (e.g. a '#anchor' fragment)
    ind = url.rfind('html')
    if ind != -1:
        ind += 4
    else:
        ind = -1
    url = url[0:ind]
    return prefix_url + url
#for the recursive crawl, fix up the prefix url
def fix_prefix_url(url):
    #keep everything up to and including the last '/'
    ind = url.rfind('/')
    url = url[0:ind]
    return url + '/'
#get the save path (directory part) of the file
def get_path(suffix_url):
    ind = suffix_url.rfind('/')
    if ind != -1:
        path = suffix_url[0:ind]
    else:
        path = ''
    return path
#get the file name from the given url
def fix_filename(suffix_url):
    ind0 = suffix_url.rfind('/')
    if ind0 != -1:
        ind0 += 1
    else:
        ind0 = 0
    ind1 = suffix_url.rfind('html')
    if ind1 != -1:
        ind1 += 4
    else:
        ind1 = -1
    filename = suffix_url[ind0:ind1]
    return filename
#only crawl html pages
def is_html(url):
    ind = url.rfind('html')
    if ind == -1:
        return False
    return True
#recursive crawl
#note: only follow links of the form <a class="reference internal" />
def crawl(prefix_url, url, current_path):
    #only crawl html pages
    if not is_html(url):
        return
    #for the recursive crawl, recompute the save path, prefix url, etc. each time
    file_path = current_path + '/' + get_path(url)
    file_name = fix_filename(url)
    target_url = fix_url(prefix_url, url)
    prefix_url = fix_prefix_url(target_url)
    #skip the url if it has already been crawled in this run
    if target_url in crawled_urls:
        print(target_url + " has been crawled")
        return
    #also skip the url if its file already exists on disk
    if os.path.exists(os.path.join(file_path, file_name)):
        print(target_url + " has been crawled")
        return
    print("will crawl " + target_url)
    crawled_urls.append(target_url)
    #visit the url
    resp = requests.get(target_url, cookies=cookies,
                        auth=('qedtester', 'g5v8KRVcupp4'), verify=False)
    #get the content of the page from the response; save() writes it out as utf-8
    page = resp.text
    #save page to file_path/file_name
    save(page, file_name, file_path)
    #parse the page to get hyperlinks
    html_tree = html.fromstring(page)
    #use lxml's xpath to extract the internal hyperlinks
    target_urls = html_tree.xpath('//a[@class="reference internal"]')
    print("size of target_urls: " + str(len(target_urls)))
    for link in target_urls:
        suffix_url = link.attrib['href']
        #recursive crawl
        crawl(prefix_url, suffix_url, file_path)
#create the directory if it does not exist yet
def setup_dir(dir_name):
    if not os.path.exists(dir_name):
        print("mkdir", dir_name)
        os.makedirs(dir_name)
    else:
        print("%s already exists" % dir_name)
if __name__ == '__main__':
    print("Crawl is running...")
    prefix_url = 'https://docs.qed-it.com/docs/sdk-docs/en/latest/'
    #the first page to crawl
    url = 'index.html'
    current_path = os.getcwd()
    crawl(prefix_url, url, current_path)
    print("Crawl comes back.")