# getip.py comes from https://mp.youkuaiyun.com/postedit/99288836
import getip
import re
import cx_Oracle
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
# A few pages that are going to be crawled, kept here for testing.
testurl = [
    "https://www.tujia.com/gongyu/hangzhou/1/",
    "https://www.tujia.com/gongyu/hangzhou/2/",
]
# Second list of sample pages (detail URLs).
testur2 = [
    "https://www.tujia.com/detail/12690196.htm",
    "https://www.tujia.com/detail/11146003.htm",
]

# API URL handed to getip.check together with the test pages.
# NOTE(review): presumably the proxy provider's "generate proxy" endpoint -- confirm.
thisapi = 'http://ip.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&groupid=0&qty=1&time=100&pro=&city=&port=1&format=txt&ss=1&css=&dt=1&specialTxt=3&specialJson='

# Initial check; 0 is passed in place of an existing ip.
# NOTE(review): ip/ua are presumably a proxy address and a user-agent string -- verify in getip.
ip, ua = getip.check(0, thisapi, testurl)

# Open the MySQL connection and grab a cursor for the rest of the script.
conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='******',
    db='hzdz',
    charset='utf8',
)
cur = conn.cursor()
# Oracle alternative, left disabled:
# conn = cx_Oracle.connect('***', '***', 'localhost:1521/orcl')  # connect to the database
# cur = conn.cursor()  # obtain a cursor

# Starting value of the counter; its use lies outside this block.
count = 1671
for j in range(776,786 ):
url = "https://www.tujia.com/gongyu/hangzhou/" + str(j) + '/'
for i in range(0,2):
try:
ip, ua = getip.check(ip, thisapi,testurl)
getip.install(ip, ua)
data1 = urllib.request.urlopen(url).read()
data = data1.decode("utf-8", "ignore")
if(len(data)<3000):
continue
else:
print("----当前IP有效--------")
#print(type(data))
pat = '<div class="label-tag">.*?<div clas