以下是一个实现您需求的Python脚本示例,该脚本从MySQL数据库中提取数据并通过Socket端口10050发送,供Flume收集:
import socket
import pymysql
import json
import time
class MySQLSocketServer:
def __init__(self, host='0.0.0.0', port=10050):
# Socket服务器配置
self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.server_socket.bind((host, port))
self.server_socket.listen(5) # 允许最大排队连接数
# MySQL数据库配置(根据实际情况修改)
self.mysql_config = {
'host': 'localhost',
'user': 'root',
'password': 'your_password',
'db': 'data_realtime',
'charset': 'utf8mb4',
'cursorclass': pymysql.cursors.DictCursor
}
# 跟踪最后处理的ID(假设表中有自增id字段)
self.last_id = 0
def _get_new_data(self):
"""从MySQL获取新增数据"""
try:
connection = pymysql.connect(**self.mysql_config)
with connection.cursor() as cursor:
# 查询新数据(根据你的表结构调整SQL)
sql = "SELECT * FROM your_table WHERE id > %s ORDER BY id ASC"
cursor.execute(sql, (self.last_id,))
results = cursor.fetchall()
if results:
self.last_id = results[-1]['id'] # 更新最后处理的ID
return results
except Exception as e:
print(f"Database error: {e}")
return []
finally:
if 'connection' in locals() and connection:
connection.close()
def _send_data(self, client_socket, data):
"""发送数据到客户端"""
try:
for record in data:
# 转换为JSON格式并发送
json_data = json.dumps(record) + '\n'
client_socket.sendall(json_data.encode('utf-8'))
except Exception as e:
print(f"发送数据失败: {e}")
def start(self):
print(f"Socket服务器已启动,正在监听端口 10050...")
while True:
client_sock, addr = self.server_socket.accept()
print(f"接收到来自 {addr} 的连接")
try:
while True:
# 获取新数据
new_data = self._get_new_data()
if new_data:
self._send_data(client_sock, new_data)
else:
time.sleep(1) # 没有新数据时暂停1秒
except ConnectionResetError:
print("客户端连接已断开")
finally:
client_sock.close()
if __name__ == "__main__":
server = MySQLSocketServer()
server.start()
使用说明
- 环境准备:
# 安装依赖
pip install pymysql
- 配置调整:
- 修改
<font style="color:rgb(64, 64, 64);">mysql_config</font>
中的数据库连接信息 - 根据实际表结构调整SQL查询语句(特别是
<font style="color:rgb(64, 64, 64);">id</font>
字段和表名) - 如果使用时间戳跟踪新数据,修改查询逻辑为:
sql = "SELECT * FROM your_table WHERE update_time > %s ORDER BY update_time ASC"
- 运行脚本:
python socket_server.py
- Flume配置示例(flume-conf.properties):
# 定义组件
agent.sources = netcat-source
agent.channels = memory-channel
agent.sinks = kafka-sink
# 配置Source
agent.sources.netcat-source.type = netcat
agent.sources.netcat-source.bind = 0.0.0.0
agent.sources.netcat-source.port = 10050
agent.sources.netcat-source.channels = memory-channel
# 配置Channel
agent.channels.memory-channel.type = memory
agent.channels.memory-channel.capacity = 1000
# 配置Kafka Sink
agent.sinks.kafka-sink.type = org.apache.flume.sink.kafka.KafkaSink
agent.sinks.kafka-sink.kafka.topic = your_topic
agent.sinks.kafka-sink.kafka.bootstrap.servers = localhost:9092
agent.sinks.kafka-sink.channel = memory-channel
关键点说明
- 数据增量获取:
- 通过记录最后处理的
<font style="color:rgb(64, 64, 64);">id</font>
实现增量查询 - 每次只获取比上次
<font style="color:rgb(64, 64, 64);">id</font>
大的新数据 - 如果使用时间戳字段,需要相应修改查询条件
- 通过记录最后处理的
- 数据格式:
- 使用JSON格式序列化数据
- 每条数据末尾添加换行符(
<font style="color:rgb(64, 64, 64);">\n</font>
),符合Flume的Line-based协议
- 错误处理:
- 自动处理客户端断开连接
- 数据库连接错误会打印日志并继续运行
- 发送失败会跳过当前数据继续发送