爬虫代码(TJ)

脚本中导入的 getip.py 模块来自:https://mp.youkuaiyun.com/postedit/99288836

import getip
import re
import cx_Oracle
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
# Module-level setup for the crawler: sample URLs, the proxy API endpoint,
# an initial proxy/user-agent pair, and the MySQL connection used below.

# A few of the pages to be crawled, used for testing.
# These listing pages are passed to getip.check() as its third argument.
testurl=[
    "https://www.tujia.com/gongyu/hangzhou/1/",
    "https://www.tujia.com/gongyu/hangzhou/2/"
]
# Sample detail-page URLs. Not referenced in the visible part of the script.
# NOTE(review): the name looks like a typo of "testurl2" - confirm before renaming.
testur2=[
    "https://www.tujia.com/detail/12690196.htm",
    "https://www.tujia.com/detail/11146003.htm"
]
# Endpoint handed to getip.check(); the query string asks for qty=1 in txt format.
# Presumably a proxy-provider API that returns one proxy address - confirm against getip.py.
thisapi = 'http://ip.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&groupid=0&qty=1&time=100&pro=&city=&port=1&format=txt&ss=1&css=&dt=1&specialTxt=3&specialJson='
# Initial call with 0 in place of an existing ip; the crawl loop later passes
# the current ip back in. Presumably 0 means "no proxy yet, fetch a new one",
# and ua is presumably a User-Agent string - confirm against getip.py.
ip, ua = getip.check(0,thisapi,testurl)
# Connect to the local MySQL database 'hzdz' and open a cursor on it.
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='******', db='hzdz', charset='utf8')
cur = conn.cursor()

# Alternative Oracle backend, currently disabled:
# conn = cx_Oracle.connect('***', '***', 'localhost:1521/orcl')  # connect to the database
# cur = conn.cursor()  # get a cursor
# Starting value of a counter; its use is not visible in this part of the script.
# Presumably a running record number resumed from a previous run - TODO confirm.
count = 1671
for j in range(776,786 ):

    url = "https://www.tujia.com/gongyu/hangzhou/" + str(j) + '/'
    for i in range(0,2):
        try:
            ip, ua = getip.check(ip, thisapi,testurl)
            getip.install(ip, ua)

            data1 = urllib.request.urlopen(url).read()
            data = data1.decode("utf-8", "ignore")
            if(len(data)<3000):
                continue
            else:
                print("----当前IP有效--------")
                #print(type(data))
                pat = '<div class="label-tag">.*?<div clas
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值