import re
from parse_url import parse_url
import requests
import time
import threading
from queue import Queue
# Wall-clock start timestamp; the elapsed run time is printed in the __main__ block.
start = time.time()
class Neihan_Spider(object):
    """Threaded joke scraper for budejie.com.

    A pipeline of daemon worker threads connected by queues:

        url_queue --fetch--> html_queue --extract--> content_queue --save--> neihan.txt

    ``run()`` starts all workers and blocks until every queued item has been
    fully processed (each stage calls ``task_done()``).
    """

    # Patterns are compiled once here instead of on every loop iteration.
    # _DESC_RE captures the joke body from a listing-page entry.
    _DESC_RE = re.compile(
        r"""<div class="j-r-list-c-desc">\n \n \n <a href="/detail-\d+.html">(.*?)</a>""",
        re.S)
    # _BR_RE normalizes HTML line breaks inside a joke to real newlines.
    _BR_RE = re.compile(r"<br />")

    def __init__(self):
        # Listing pages are numbered; pages 1..29 are queued by get_url_list().
        self.url_temp = "http://www.budejie.com/text/{}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}
        self.url_queue = Queue()      # URLs waiting to be fetched
        self.html_queue = Queue()     # decoded HTML waiting to be parsed
        self.content_queue = Queue()  # per-page joke lists waiting to be saved

    def get_url_list(self):
        """Producer: enqueue the 29 listing-page URLs."""
        for page in range(1, 30):
            self.url_queue.put(self.url_temp.format(page))

    def parse_url(self):
        """Worker: fetch each queued URL and enqueue the decoded HTML.

        NOTE(review): the method name shadows the module-level ``parse_url``
        import; kept unchanged for backward compatibility.
        """
        while True:
            url = self.url_queue.get()
            print(url)
            try:
                # timeout keeps a hung server from stalling the pipeline forever
                response = requests.get(url, headers=self.headers, timeout=10)
                self.html_queue.put(response.content.decode())
            except requests.RequestException as e:
                # Best-effort: skip the page but keep the worker (and join()) alive.
                print("fetch failed:", url, e)
            finally:
                # Always acknowledge, otherwise url_queue.join() deadlocks.
                self.url_queue.task_done()

    def get_content_list(self):
        """Worker: extract the joke texts from each fetched HTML page."""
        while True:
            html_str = self.html_queue.get()
            self.content_queue.put(self._DESC_RE.findall(html_str))
            self.html_queue.task_done()

    def save_data(self):
        """Worker: append each batch of jokes to neihan.txt, one per paragraph."""
        while True:
            content_list = self.content_queue.get()
            # Explicit encoding so the output does not depend on the locale.
            with open("neihan.txt", 'a', encoding="utf-8") as f:
                for joke in content_list:
                    f.write(self._BR_RE.sub("\n", joke) + "\n\n")
            self.content_queue.task_done()

    def run(self):
        """Start all pipeline threads and wait for every item to be processed."""
        t_url = threading.Thread(target=self.get_url_list)
        thread_list = [t_url]
        for _ in range(3):
            thread_list.append(threading.Thread(target=self.parse_url))
        for _ in range(2):
            thread_list.append(threading.Thread(target=self.get_content_list))
        thread_list.append(threading.Thread(target=self.save_data))
        for t in thread_list:
            t.daemon = True  # setDaemon() is deprecated since Python 3.10
            t.start()
        # Fix a startup race: url_queue.join() would return immediately if the
        # producer had not queued anything yet; wait for the producer first.
        t_url.join()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()
        print("主线程结束")
if __name__ == '__main__':
    # Entry point: run the full scraping pipeline, then report elapsed time.
    spider = Neihan_Spider()
    spider.run()
    elapsed = time.time() - start
    print(elapsed)