本例中多个网站的网址保存在“D:\test.xlsx”文件的第一列；代码运行结束后，站内链接导出到第二列，链接名保存到第三列。以下代码已在 Python 2.7 中调试通过：
#coding=utf-8
import requests
import os
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl import load_workbook
from requests import exceptions
def get_html(url):
'''
headers = {
'User-Agent': 'Mozilla/5.0(Macintosh; Intel Mac OS X 10_11_4)\
AppleWebKit/537.36(KHTML, like Gecko) Chrome/52 .0.2743. 116 Safari/537.36'
} #模拟浏览器访问
'''
headers = {
'User-Agent': 'Mozilla/5.0(Windows NT 10.0; WOW64; Trident/7.0; rv:11.0)\
like Gecko'
} # 模拟浏览器访问
t1=30
try:
r = requests.get(url,timeout=t1, headers = headers) #请求访问网站
r.raise_for_status()
r.encoding = r.apparent_encoding
#print(r.status_code)
if r.status_code == 200:
return r.text