最近刚学了爬虫,迫不及待找了个测试网站练练
为了效果更好,我加入了:
- python多线程的老模块
_thread
- 简易的UA代理池
fake_useragent
(第三方库,需自行安装:pip install fake-useragent)
自己电脑配置也高一点,否则带不动。
爬虫代码
# -*- coding:utf-8 -*-
import _thread
import threading
import time
from urllib import request

from fake_useragent import UserAgent
class my_spider(object):
    """Small crawler that fetches one page repeatedly with rotating User-Agents.

    Attributes:
        tick_cnt: 1-based counter of the next successful request made by run().
        url: Page that every request targets.
        Ua_Pattern: fake_useragent ``UserAgent`` instance used as the
            User-Agent pool, or ``None`` if it could not be constructed.
    """

    # User-Agent families requested in turn during one round of run().
    BROWSERS = ("firefox", "ie", "safari", "edge")
    # Seconds to wait for the server before a request is abandoned.
    TIMEOUT = 10

    def __init__(self):
        """Build the User-Agent pool and print the page once as a smoke test.

        Failures are reported on stdout instead of being raised, so
        construction never throws for network or pool errors.
        """
        self.tick_cnt = 1
        # Set before the try block so run() always finds these attributes,
        # even when UserAgent() or the first request fails.
        self.url = "https://www.mianshiya.com/"
        self.Ua_Pattern = None
        try:
            self.Ua_Pattern = UserAgent()
            first_html = self._fetch("firefox")
            print("first_html get message : ")
            print(first_html)
        except Exception as error_message:
            # str() is required: concatenating str with an exception object
            # raises TypeError and would hide the real error.
            print("Catch Error: " + str(error_message))

    def _fetch(self, browser):
        """Return the decoded body of self.url fetched with *browser*'s UA."""
        headers = {"User-Agent": getattr(self.Ua_Pattern, browser)}
        req = request.Request(url=self.url, headers=headers)
        # The context manager closes the response even if read() fails.
        with request.urlopen(req, timeout=self.TIMEOUT) as response:
            return response.read().decode("utf-8", errors="replace")

    @staticmethod
    def _pause(delay):
        """Sleep for *delay* seconds; falsy or non-positive values skip it."""
        if delay and delay > 0:
            time.sleep(delay)

    def run(self, max_rounds=None, delay=1.0, max_errors=5):
        """Fetch the page once per User-Agent family, round after round.

        Args:
            max_rounds: Number of rounds (one request per entry in BROWSERS)
                to attempt; ``None`` keeps going until a stop condition hits.
            delay: Seconds to wait after every request so the target server
                is not flooded.
            max_errors: Consecutive failed rounds tolerated before giving up.

        Returns:
            None. Stops when the server returns an empty body, when
            *max_errors* rounds fail in a row, or after *max_rounds* rounds.
        """
        if self.Ua_Pattern is None:
            print("Take Error As: ")
            print("User-Agent pool is not available")
            return

        rounds_done = 0
        failures_in_a_row = 0
        while max_rounds is None or rounds_done < max_rounds:
            try:
                for browser in self.BROWSERS:
                    if not self._fetch(browser):
                        # An empty body is treated as a refusal; return rather
                        # than exit(), which only raises SystemExit in a thread.
                        print("Device our request!")
                        print("Exit!")
                        return
                    print("Tick[" + str(self.tick_cnt) + "] ", end=" ")
                    self.tick_cnt += 1
                    self._pause(delay)
                failures_in_a_row = 0
            except Exception as error_message:
                print("Take Error As: ")
                print(error_message)
                failures_in_a_row += 1
                if failures_in_a_row >= max_errors:
                    print("Exit!")
                    return
                # Back off instead of busy-looping on a persistent failure.
                self._pause(delay)
            rounds_done += 1
def run_sub_request():
    """Thread entry point: build one spider and let it run."""
    my_spider().run()
def main(thread_count=4):
    """Start a fixed number of crawler threads and wait for them to finish.

    Args:
        thread_count: How many worker threads to start. Each one executes
            run_sub_request. The count is bounded on purpose: spawning
            threads in an endless loop exhausts local resources.

    Returns:
        None, once every started worker has finished.
    """
    workers = []
    for _ in range(thread_count):
        # Daemon threads let Ctrl-C in the main thread end the whole process.
        worker = threading.Thread(target=run_sub_request, daemon=True)
        try:
            worker.start()
        except RuntimeError:
            # Thread.start() raises RuntimeError when no thread can be created.
            print("Error: Can't init thread")
            break
        workers.append(worker)

    # Wait here; returning early would end the process and kill the workers.
    for worker in workers:
        worker.join()
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
请勿对未经授权的网站运行爬虫或压测脚本:高频并发请求可能构成拒绝服务(DoS)攻击并触犯法律,否则真的是从入门到入狱。只应在自己拥有或已获授权的测试环境中尝试。