Multithreading in Python is provided by the threading module. Because of the Global Interpreter Lock (GIL), Python threads are better suited to I/O-bound tasks than to CPU-bound tasks.
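A minimal sketch of why this is so (the io_task helper and the 1-second sleep are illustrative assumptions, not part of the original text): the GIL is released while a thread blocks in sleep() or real I/O, so the waits overlap and four "I/O" tasks finish in roughly one second instead of four. Pure computation would see no such speedup.

import threading
import time

def io_task():
    time.sleep(1)  # simulates blocking I/O; the GIL is released while sleeping

start = time.time()
threads = [threading.Thread(target=io_task) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

# Roughly 1 second rather than 4: the waits overlap across threads.
print(f"4 I/O tasks took {time.time() - start:.1f}s")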
Multithreading is similar to running several different programs at the same time, and it has the following advantages:
- Long-running tasks in a program can be moved to the background.
- The user interface can be more responsive; for example, after the user clicks a button that triggers some processing, a progress bar can be shown to report progress.
- The program may run faster.
- Threads are useful for tasks that spend time waiting, such as user input, file I/O, and sending or receiving data over the network. While waiting, a thread can release scarce resources such as memory.
Each thread has an entry point where execution begins, a sequential flow of execution, and an exit point. A thread cannot run on its own, however; it must exist within an application, and the application provides execution control for its threads.
Each thread has its own set of CPU registers, known as the thread's context; the context reflects the state of the CPU registers from the last time the thread ran.
The instruction pointer and the stack pointer are the two most important registers in a thread's context. A thread always runs within the context of its process, and these addresses refer to memory in the address space of the process that owns the thread.
- Threads can be preempted (interrupted).
- While other threads are running, a thread can be temporarily set aside (put to sleep); this is called yielding.
Threads fall into two categories:
- Kernel threads: created and destroyed by the operating system kernel.
- User threads: implemented entirely in user programs, without kernel support.
1. Basic Usage
1.1 Creating Threads
import threading
import time

def worker(num):
    print(f"Thread {num} starting")
    time.sleep(2)  # simulate a time-consuming operation
    print(f"Thread {num} finished")

# Create the threads
threads = []
for i in range(5):
    t = threading.Thread(target=worker, args=(i,))
    threads.append(t)
    t.start()

# Wait for all threads to finish
for t in threads:
    t.join()

print("All threads have finished")
1.2 Subclassing Thread
import threading
import time

class MyThread(threading.Thread):
    def __init__(self, num):
        super().__init__()
        self.num = num

    def run(self):
        print(f"Custom thread {self.num} starting")
        time.sleep(1)
        print(f"Custom thread {self.num} finished")

# Use the custom thread class
threads = []
for i in range(3):
    t = MyThread(i)
    threads.append(t)
    t.start()

for t in threads:
    t.join()
2. Thread Synchronization
2.1 Using Lock
import threading

shared_value = 0
lock = threading.Lock()

def increment():
    global shared_value
    for _ in range(100000):
        lock.acquire()
        shared_value += 1
        lock.release()

threads = []
for _ in range(5):
    t = threading.Thread(target=increment)
    threads.append(t)
    t.start()

for t in threads:
    t.join()

print(f"Final shared value: {shared_value}")  # should be 500000
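The explicit acquire()/release() pair above is more commonly written with a with statement, which releases the lock even if the body raises an exception. A minimal self-contained rewrite of increment() in that style:

import threading

shared_value = 0
lock = threading.Lock()

def increment():
    global shared_value
    for _ in range(100000):
        with lock:  # acquired here, released automatically on exit, even on error
            shared_value += 1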
2.2 Using RLock (Reentrant Lock)
import threading

rlock = threading.RLock()

def recursive_func(count):
    if count <= 0:
        return
    rlock.acquire()
    print(f"Lock acquired, count = {count}")
    recursive_func(count - 1)
    rlock.release()

t = threading.Thread(target=recursive_func, args=(5,))
t.start()
t.join()
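For contrast, a plain threading.Lock is not reentrant: a second acquire() by the same thread blocks forever. A minimal sketch that makes this observable without hanging, using the timeout parameter of acquire() (the plain_lock name is illustrative):

import threading

plain_lock = threading.Lock()

plain_lock.acquire()
# A second acquire by the same thread would block indefinitely with a plain
# Lock; the timeout turns the deadlock into an observable False result.
acquired_again = plain_lock.acquire(timeout=1)
print(f"Re-acquired plain Lock: {acquired_again}")  # False: not reentrant
plain_lock.release()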
2.3 Using Condition (Condition Variable)
import threading
import time

condition = threading.Condition()
items = []

def consumer():
    condition.acquire()
    while not items:  # use a loop, not an if, to re-check the predicate after waking
        print("Consumer waiting...")
        condition.wait()
    print(f"Consumer consumed: {items.pop()}")
    condition.release()

def producer():
    condition.acquire()
    items.append("product")
    print("Producer produced")
    condition.notify()
    condition.release()

c = threading.Thread(target=consumer)
p = threading.Thread(target=producer)

# Start the consumer first; it will wait
c.start()
time.sleep(1)

# Then start the producer
p.start()

c.join()
p.join()
2.4 Using Semaphore
import threading
import time

semaphore = threading.Semaphore(3)  # allow at most 3 threads in at once

def access_resource(thread_id):
    print(f"Thread {thread_id} waiting to access the resource")
    with semaphore:
        print(f"Thread {thread_id} acquired access")
        time.sleep(2)
        print(f"Thread {thread_id} releasing access")

threads = []
for i in range(10):
    t = threading.Thread(target=access_resource, args=(i,))
    threads.append(t)
    t.start()

for t in threads:
    t.join()
3. Thread-Safe Queues (queue)
Common methods of queue.Queue (from the queue module):
- Queue.qsize() returns the approximate size of the queue.
- Queue.empty() returns True if the queue is empty, False otherwise.
- Queue.full() returns True if the queue is full, False otherwise; "full" is determined by the maxsize given when the queue was created.
- Queue.get(block=True, timeout=None) removes and returns an item; timeout is the maximum time to wait.
- Queue.get_nowait() is equivalent to Queue.get(False).
- Queue.put(item, block=True, timeout=None) adds an item to the queue; timeout is the maximum time to wait.
- Queue.put_nowait(item) is equivalent to Queue.put(item, False).
- Queue.task_done() signals that a previously fetched item has been fully processed.
- Queue.join() blocks until every item that was put into the queue has been fetched and marked complete with task_done().
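The following single-threaded sketch exercises most of these methods (the letter items are illustrative):

import queue

q = queue.Queue(maxsize=2)
print(q.empty())   # True: nothing has been put yet
q.put("a")
q.put("b")
print(q.full())    # True: maxsize=2 reached
print(q.qsize())   # 2 (only approximate when other threads are involved)
print(q.get())     # 'a' -- FIFO order
print(q.get())     # 'b'
try:
    q.get(timeout=0.5)  # the queue is now empty, so this times out
except queue.Empty:
    print("get() timed out on an empty queue")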
import threading
import time
import queue

def producer(q, items):
    for item in items:
        print(f"Producing: {item}")
        q.put(item)
        time.sleep(0.1)
    q.put(None)  # send the shutdown signal

def consumer(q):
    while True:
        item = q.get()
        if item is None:  # shutdown signal received
            break
        print(f"Consuming: {item}")
        q.task_done()

q = queue.Queue()
producer_thread = threading.Thread(target=producer, args=(q, ["A", "B", "C", "D"]))
consumer_thread = threading.Thread(target=consumer, args=(q,))

producer_thread.start()
consumer_thread.start()

producer_thread.join()
consumer_thread.join()
print("Producing and consuming finished")
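Instead of a None sentinel, Queue.join() can act as the completion signal when the consumer runs as a daemon thread: every put() is then matched by a task_done(), and the program exits once join() returns. A minimal sketch of that variant:

import threading
import queue

q = queue.Queue()

def consumer(q):
    while True:
        item = q.get()
        print(f"Consuming: {item}")
        q.task_done()  # lets q.join() know this item is fully processed

t = threading.Thread(target=consumer, args=(q,), daemon=True)
t.start()

for item in ["A", "B", "C", "D"]:
    q.put(item)

q.join()  # blocks until task_done() has been called once per item
print("All items processed")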
4. Thread Pools
Python 3.2+ provides the concurrent.futures module, which makes thread pools much easier to use:
from concurrent.futures import ThreadPoolExecutor
import urllib.request

def fetch_url(url):
    with urllib.request.urlopen(url) as response:
        return response.read()

urls = [
    'https://www.python.org',
    'https://www.baidu.com',
    'https://www.hao123.com'
]

with ThreadPoolExecutor(max_workers=3) as executor:
    results = executor.map(fetch_url, urls)
    for url, content in zip(urls, results):
        print(f"{url} page length: {len(content)}")
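executor.map() yields results in input order. To handle each result as soon as it is ready, the usual pattern is submit() plus as_completed(); a minimal sketch reusing fetch_url and urls from above:

from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor(max_workers=3) as executor:
    # Map each Future back to its URL so results can be reported per task
    future_to_url = {executor.submit(fetch_url, url): url for url in urls}
    for future in as_completed(future_to_url):  # yields futures as they finish
        url = future_to_url[future]
        try:
            content = future.result()
            print(f"{url} page length: {len(content)}")
        except Exception as e:
            print(f"{url} failed: {e}")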
5. Caveats
- GIL limitation: because of the Global Interpreter Lock (GIL), Python threads are a poor fit for CPU-bound tasks; consider multiple processes (the multiprocessing module) instead.
- Thread safety: access to shared data must go through locks or other synchronization mechanisms.
- Deadlock avoidance: acquire locks in a consistent order to avoid circular waits.
- Daemon threads: setting thread.daemon = True (before start() is called) makes a thread end automatically when the main thread exits; a short sketch follows the example below.
- Thread-local data: use threading.local() to create per-thread data storage, as in the following example.
import threading

local_data = threading.local()

def show_data():
    print(threading.current_thread().name, local_data.value)

def thread_func(value):
    local_data.value = value  # each thread sees only its own copy
    show_data()

threads = []
for i in range(3):
    t = threading.Thread(target=thread_func, args=(i,))
    threads.append(t)
    t.start()

for t in threads:
    t.join()
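To round out the daemon-thread note from the list above, a minimal sketch (the 2-second sleep is illustrative): the daemon thread is terminated as soon as the main thread exits, so its final print never runs.

import threading
import time

def background():
    time.sleep(2)
    print("This line is never reached")  # the process exits before the sleep ends

t = threading.Thread(target=background)
t.daemon = True  # must be set before start()
t.start()

print("Main thread exiting; the daemon thread is terminated with it")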
6. Practical Multithreading Examples
The following practical examples cover common multithreading use cases.
6.1 Multithreaded File Downloads
import threading
import urllib.request
import os
from queue import Queue

class DownloadThread(threading.Thread):
    def __init__(self, queue, download_dir):
        super().__init__()
        self.queue = queue
        self.download_dir = download_dir
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)

    def run(self):
        while True:
            url = self.queue.get()
            try:
                self.download_file(url)
            except Exception as e:
                print(f"Failed to download {url}: {e}")
            finally:
                self.queue.task_done()

    def download_file(self, url):
        file_name = url.split('/')[-1]
        file_path = os.path.join(self.download_dir, file_name)
        print(f"Starting download: {file_name}")
        urllib.request.urlretrieve(url, file_path)
        print(f"Finished download: {file_name}")

def download_manager(url_list, num_threads=4, download_dir='downloads'):
    queue = Queue()

    # Create the worker threads
    for _ in range(num_threads):
        t = DownloadThread(queue, download_dir)
        t.daemon = True
        t.start()

    # Add the download tasks to the queue
    for url in url_list:
        queue.put(url)

    # Wait for all tasks to complete
    queue.join()
    print("All download tasks finished")

# Usage example
if __name__ == '__main__':
    urls = [
        'https://www.python.org/static/img/python-logo.png',
        'https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png',
        'https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png',
        'https://upload.wikimedia.org/wikipedia/commons/3/3d/LARGE_elevation.jpg'
    ]
    download_manager(urls, num_threads=3)
6.2 Multithreaded Web Crawler
import threading
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from queue import Queue

class CrawlerThread(threading.Thread):
    def __init__(self, queue, visited, lock, max_depth=2):
        super().__init__()
        self.queue = queue
        self.visited = visited
        self.lock = lock
        self.max_depth = max_depth

    def run(self):
        while True:
            url, depth = self.queue.get()
            if depth > self.max_depth:
                self.queue.task_done()
                continue
            try:
                self.crawl(url, depth)
            except Exception as e:
                print(f"Failed to crawl {url}: {e}")
            finally:
                self.queue.task_done()

    def crawl(self, url, depth):
        # Skip URLs that have already been visited
        with self.lock:
            if url in self.visited:
                return
            self.visited.add(url)
        print(f"Crawling: {url} (depth: {depth})")
        try:
            response = requests.get(url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')
            # Process the current page
            self.process_page(url, soup)
            # Extract new links and add them to the queue
            if depth < self.max_depth:
                for link in soup.find_all('a', href=True):
                    absolute_url = urljoin(url, link['href'])
                    if absolute_url.startswith('http'):
                        self.queue.put((absolute_url, depth + 1))
        except requests.RequestException as e:
            print(f"Request for {url} failed: {e}")

    def process_page(self, url, soup):
        # Page-processing logic goes here
        title = soup.title.string if soup.title else 'no title'
        print(f"Processing page: {title} ({url})")

def start_crawler(start_url, num_threads=4, max_depth=2):
    queue = Queue()
    visited = set()
    lock = threading.Lock()

    # Create the crawler threads
    for _ in range(num_threads):
        t = CrawlerThread(queue, visited, lock, max_depth)
        t.daemon = True
        t.start()

    # Add the initial URL
    queue.put((start_url, 0))

    # Wait for all tasks to complete
    queue.join()
    print(f"Crawl finished; visited {len(visited)} pages")

# Usage example
if __name__ == '__main__':
    start_crawler('https://www.python.org', num_threads=3, max_depth=1)
6.3 Multithreaded Processing of Large Datasets
import threading
import time
import random
from queue import Queue

class DataProcessor:
    def __init__(self, data_chunks, num_threads=4):
        self.data_chunks = data_chunks
        self.num_threads = num_threads
        self.queue = Queue()
        self.results = []
        self.lock = threading.Lock()

    def process_chunk(self, chunk):
        # Simulate a time-consuming computation
        time.sleep(random.uniform(0.1, 0.5))
        # Replace this with the real data-processing logic
        result = sum(x ** 2 for x in chunk)
        return result

    def worker(self):
        while True:
            chunk = self.queue.get()
            if chunk is None:  # shutdown signal
                self.queue.task_done()
                break
            result = self.process_chunk(chunk)
            with self.lock:
                self.results.append(result)
                print(f"Chunk done: result={result} (tasks remaining: {self.queue.qsize()})")
            self.queue.task_done()

    def run(self):
        # Start the worker threads
        threads = []
        for _ in range(self.num_threads):
            t = threading.Thread(target=self.worker)
            t.start()
            threads.append(t)

        # Add the data chunks to the queue
        for chunk in self.data_chunks:
            self.queue.put(chunk)

        # Add the shutdown signals
        for _ in range(self.num_threads):
            self.queue.put(None)

        # Wait for all tasks to complete
        self.queue.join()

        # Aggregate the results
        total = sum(self.results)
        print(f"All data processed; final result: {total}")
        return total

# Usage example
if __name__ == '__main__':
    # Generate test data (100 chunks of 10 random numbers each)
    data = [[random.randint(1, 100) for _ in range(10)] for _ in range(100)]
    processor = DataProcessor(data, num_threads=5)
    processor.run()
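For a transformation this simple, the same fan-out/fan-in shape can be had with far less plumbing using the ThreadPoolExecutor from section 4; a minimal sketch under the same assumptions (chunks of random integers, sum of squares per chunk). Keep the GIL caveat from section 5 in mind: for genuinely CPU-bound work, ProcessPoolExecutor is usually the better fit.

from concurrent.futures import ThreadPoolExecutor
import random

def process_chunk(chunk):
    return sum(x ** 2 for x in chunk)

data = [[random.randint(1, 100) for _ in range(10)] for _ in range(100)]

with ThreadPoolExecutor(max_workers=5) as executor:
    # map() handles the queueing, dispatch, and result collection
    total = sum(executor.map(process_chunk, data))

print(f"Final result: {total}")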
6.4 Multithreaded Scheduled-Task Scheduler
import threading
import time
from datetime import datetime

class ScheduledTask:
    def __init__(self, interval, task_func, *args, **kwargs):
        self.interval = interval  # interval between runs, in seconds
        self.task_func = task_func
        self.args = args
        self.kwargs = kwargs
        self._stop_event = threading.Event()

    def start(self):
        thread = threading.Thread(target=self._run)
        thread.daemon = True
        thread.start()

    def _run(self):
        next_time = time.time()
        while not self._stop_event.is_set():
            next_time += self.interval
            self.task_func(*self.args, **self.kwargs)
            # Compute how long to sleep to stay on schedule
            sleep_time = next_time - time.time()
            if sleep_time > 0:
                self._stop_event.wait(sleep_time)

    def stop(self):
        self._stop_event.set()

def print_time(name):
    now = datetime.now().strftime("%H:%M:%S")
    print(f"[{now}] Task {name} ran")

# Usage example
if __name__ == '__main__':
    print("Starting the task scheduler (press Ctrl+C to stop)")

    # Create and start several scheduled tasks
    task1 = ScheduledTask(2, print_time, "A")
    task2 = ScheduledTask(3, print_time, "B")
    task3 = ScheduledTask(5, print_time, "C")
    task1.start()
    task2.start()
    task3.start()

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("\nStopping all tasks...")
        task1.stop()
        task2.stop()
        task3.stop()
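Note that _stop_event.wait(sleep_time) is what lets stop() take effect immediately rather than after a full interval. For comparison, the standard library's threading.Timer handles one-shot delayed calls, and a repeating task can be built by rescheduling from inside the callback; a minimal sketch (the repeat_every helper is illustrative, not a standard API):

import threading
import time

def repeat_every(interval, func, *args):
    # Run func(*args) every `interval` seconds using chained Timers
    def wrapper():
        func(*args)
        t = threading.Timer(interval, wrapper)  # schedule the next run
        t.daemon = True
        t.start()
    wrapper()

repeat_every(2, print, "tick")

time.sleep(7)  # keep the main thread alive long enough to see a few ticks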
6.5 Multithreaded Port Scanner
import threading
import socket
from queue import Queue

class PortScanner:
    def __init__(self, target, start_port=1, end_port=1024, timeout=1.0):
        self.target = target
        self.start_port = start_port
        self.end_port = end_port
        self.timeout = timeout
        self.queue = Queue()
        self.open_ports = []
        self.lock = threading.Lock()

    def scan_port(self, port):
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(self.timeout)
                result = s.connect_ex((self.target, port))
                if result == 0:  # 0 means the connection succeeded
                    with self.lock:
                        self.open_ports.append(port)
                        print(f"Port {port} is open")
        except Exception as e:
            print(f"Error scanning port {port}: {e}")

    def worker(self):
        while True:
            port = self.queue.get()
            if port is None:  # shutdown signal
                self.queue.task_done()
                break
            self.scan_port(port)
            self.queue.task_done()

    def run(self, num_threads=20):
        # Add the ports to the queue
        for port in range(self.start_port, self.end_port + 1):
            self.queue.put(port)

        # Add the shutdown signals
        for _ in range(num_threads):
            self.queue.put(None)

        # Start the worker threads
        threads = []
        for _ in range(num_threads):
            t = threading.Thread(target=self.worker)
            t.start()
            threads.append(t)

        # Wait for all tasks to complete
        self.queue.join()

        # Print the results
        print("\nScan finished; open ports:")
        for port in sorted(self.open_ports):
            print(f"Port {port} is open")
        return self.open_ports

# Usage example
if __name__ == '__main__':
    target = input("Enter the target IP or hostname to scan: ")
    start_port = int(input("Enter the start port (default 1): ") or 1)
    end_port = int(input("Enter the end port (default 1024): ") or 1024)
    scanner = PortScanner(target, start_port, end_port)
    scanner.run(num_threads=50)