#-*- coding:utf-8 -*-
import http
import http.client
import os
import random
import socket
import time
import urllib
import urllib.request as request
from typing import List, Optional
from urllib.error import URLError

# Browser-like User-Agent sent with every request.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
)


def get_ip_list_local(file_name: str) -> List[str]:
    """Read proxy addresses from a UTF-8 text file.

    Entries may be separated by newlines or any other whitespace. The
    returned strings carry no surrounding whitespace and blank entries are
    dropped, so each item can be concatenated directly into a proxy URL.

    Args:
        file_name: Path of the text file holding the proxy entries.

    Returns:
        The entries in file order; an empty list for an empty file.
    """
    with open(file_name, mode='r', encoding='utf-8') as f:
        # split() with no argument strips newlines and skips empty tokens;
        # readlines() would keep the trailing '\n' on every entry.
        return f.read().split()


def load_web_content(product_url: str, isagent: bool, agent_ip: str,
                     timeout: Optional[float] = 30) -> Optional[str]:
    """Download ``product_url`` and return its body decoded as UTF-8.

    Args:
        product_url: URL to fetch.
        isagent: When true, route the request through the proxy ``agent_ip``
            and report network errors by printing them and returning None.
            When false, fetch directly; errors propagate to the caller.
        agent_ip: Proxy address that is appended to ``'http://'``.
        timeout: Socket timeout in seconds passed to the opener, or None to
            block indefinitely.

    Returns:
        The decoded response body, or None if the proxied request failed
        with one of the handled network errors.
    """
    print('http://' + agent_ip)
    req = request.Request(product_url)
    req.add_header('User-Agent', _USER_AGENT)

    if not isagent:
        # Direct fetch: no error handling here, exceptions reach the caller.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode('utf-8')

    proxy_url = 'http://' + agent_ip
    proxy_handler = urllib.request.ProxyHandler(
        {'http': proxy_url, 'https': proxy_url})
    opener = urllib.request.build_opener(proxy_handler)
    try:
        # The context manager closes the response even if read() fails.
        with opener.open(req, timeout=timeout) as resp:
            return resp.read().decode('utf-8')
    except (TimeoutError, socket.timeout) as e:
        # socket.timeout is only an alias of TimeoutError from Python 3.10.
        print('TimeoutError code: ', e)
    except http.client.RemoteDisconnected as e:
        # Must precede ConnectionResetError, which is its base class.
        print('RemoteDisconnected : ', e)
    except ConnectionResetError as e:
        print('ConnectionResetError code: ', e)
    except URLError as e:
        print('URLError code: ', e)
        print('Reason: ', e.reason)
    except http.client.IncompleteRead as e:
        print('IncompleteRead : ', e)
    return None


def process_spider(agent_ips, product_url):
    """Fetch ``product_url`` forever through randomly chosen proxies.

    Each round picks one entry of ``agent_ips`` at random, prints the
    fetched content (None when the fetch failed), then sleeps a random
    15-60 seconds. The loop never terminates on its own.

    Args:
        agent_ips: Non-empty sequence of proxy address strings.
        product_url: URL to fetch on every round.

    Raises:
        IndexError: If ``agent_ips`` is empty.
    """
    if not agent_ips:
        # Same exception type random.choice would raise, clearer message.
        raise IndexError('agent_ips is empty: no proxy to choose from')

    while True:
        content = load_web_content(product_url, True, random.choice(agent_ips))
        print(content)
        # Sleep interval_time seconds before the next fetch.
        interval_time = random.randint(15, 60)
        time.sleep(interval_time)
        print(str(interval_time)+'s 已过,即将进行下一次内容抓取。。。。。。')


if __name__ == '__main__':
    # Built with os.path.join: the old literal '.\agent_ip\\...' contained
    # '\a', which Python interprets as the BEL control character.
    file_name = os.path.join('.', 'agent_ip', 'enable_agent_ip.txt')
    agent_ips = get_ip_list_local(file_name)
    # Original notes: fetch one product page every 30~80s and store it
    # locally; fetch each product series every 10~30s.
    # NOTE(review): the loop actually sleeps 15-60s and only prints the
    # content - confirm which behaviour is intended.
    product_url = 'url'
    process_spider(agent_ips, product_url)
    print('spider loads product content\'s game over!')
代理IP,enable_agent_ip.txt:
61.143.38.53:8118 211.127.160.240:8080 125.165.2.211:8080 182.30.224.180:80 210.245.26.140:3128 185.13.228.124:1009 46.101.92.212:80 59.111.80.139:80 195.178.56.32:8080 103.227.60.210:8080 190.248.158.194:8080 50.245.168.108:8080 96.9.69.210:53281 186.193.186.3:8080 125.165.2.211:8080 197.210.230.5:80 85.204.234.251:8080 218.202.122.221:53281 37.26.86.57:8080 187.44.182.194:8080 167.205.6.6:80 152.169.134.125:9999 141.105.162.190:8080 190.248.136.229:53281 104.41.51.173:8080 91.221.103.183:8080 13.126.69.46:80 118.102.1.114:8081 122.53.59.194:80 187.5.218.25:53281 91.193.128.76:8080 211.41.163.99:3128 204.11.243.70:3128 104.223.72.199:3128 180.211.115.155:808 143.208.9.42:8080 94.177.199.78:3128 101.53.136.123:8080 191.241.36.156:3128 40.85.184.189:8080 201.249.88.225:80 114.199.118.186:8080 101.53.136.123:8080 86.120.79.89:3128 131.255.153.171:3128 117.239.66.73:80 177.53.56.208:8080 40.85.184.189:8080 203.189.142.23:53281 59.50.68.34:53281 37.17.177.197:3128 139.196.13.42:80 202.62.9.187:8080 83.143.26.70:53281 186.193.30.101:3128 87.229.54.42:8080 202.83.162.214:8080 187.87.48.62:8080 47.74.44.92:3128 36.67.97.223:8080 103.242.239.161:65103 192.241.134.233:3128 68.171.65.230:8081 89.31.44.108:3128 180.211.91.130:8080 177.5.28.2:8080 200.114.97.14:53281 187.20.97.68:55555 52.63.138.194:8080 43.245.119.106:8080 103.195.24.81:51552 118.179.151.172:8080 118.97.29.203:8080 192.99.55.120:3128 185.76.147.151:3128