import os
import re
import subprocess
import mysql.connector
from pyhive import hive
from datetime import datetime, timedelta
import logging
from concurrent.futures import ThreadPoolExecutor
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration
MYSQL_CONFIG = {
    'host': '10.14.101.113',
    'port': 30001,
    'user': 'grafana',
    'password': '',
    'database': 'apex_log',
    'charset': 'utf8'
}

HIVE_CONFIG = {
    'host': '10.14.50.142',
    'port': 7001,
    'username': 'hive',
    'database': 'dim_game_wd'
}

LOG_BASE_PATH = 'log'        # root directory of the logs
TEMP_FILE_PATH = 'log_tmp'   # directory for temporary files
def create_temp_dir():
    """Create the temporary directory."""
    os.makedirs(TEMP_FILE_PATH, exist_ok=True)
def get_previous_date():
    """Return yesterday's date as (YYYYMMDD, YYYY-MM-DD)."""
    prev_day = datetime.now() - timedelta(days=1)
    return prev_day.strftime("%Y%m%d"), prev_day.strftime("%Y-%m-%d")
def fetch_filter_contents():
    """Fetch the filter patterns from MySQL and write them to a pattern file."""
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**MYSQL_CONFIG)
        cursor = conn.cursor()
        cursor.execute("SELECT filter_content FROM apex_filter_content WHERE need_filter=1")
        filter_contents = [row[0] for row in cursor.fetchall()]
        if not filter_contents:
            logger.warning("Nothing to filter; skipping the remaining steps")
            return 0
        # Write a UTF-8 file, then convert it to GB18030 (the logs' encoding)
        utf8_file = os.path.join(TEMP_FILE_PATH, 'pattern.utf8')
        gb_file = os.path.join(TEMP_FILE_PATH, 'pattern.gb')
        with open(utf8_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(filter_contents))
        subprocess.run(f'iconv -f UTF-8 -t GB18030 {utf8_file} > {gb_file}',
                       shell=True, check=True)
        logger.info(f"Generated pattern file with {len(filter_contents)} rules")
        return len(filter_contents)
    except (mysql.connector.Error, subprocess.CalledProcessError) as err:
        logger.error(f"Failed to build the pattern file: {err}")
        return 0
    finally:
        # conn may never have been assigned if connect() itself failed
        if conn is not None and conn.is_connected():
            cursor.close()
            conn.close()
def run_grep_search(log_date_str):
    """Run grep over the logs with the pattern file."""
    try:
        gb_file = os.path.join(TEMP_FILE_PATH, 'pattern.gb')
        result_file = os.path.join(TEMP_FILE_PATH, 'filter_results.log')
        # -a treats binary-looking files as text, -F matches fixed strings
        cmd = f"grep -a -F -f {gb_file} {LOG_BASE_PATH}/*/svr*/*{log_date_str}.txt > {result_file}"
        logger.info(f"Running: {cmd}")
        # grep exits with 1 when nothing matches, which is not an error here
        subprocess.run(cmd, shell=True)
        logger.info("Log filtering finished")
        return result_file
    except Exception as e:
        logger.error(f"grep failed: {e}")
        return None
# Compiled once at module level so each parsed line avoids recompilation
DIST_ID_PATTERN = re.compile(rf'{re.escape(LOG_BASE_PATH)}/(\d+)/svr\d+/')
ACCOUNT_GID_PATTERN = re.compile(r'\[.*?\((.*?)/(.*?)\)\]')
_FILTER_CONTENTS = None

def _load_filter_contents():
    """Load and cache the filter patterns so parse_log_line does not re-read the file per line."""
    global _FILTER_CONTENTS
    if _FILTER_CONTENTS is None:
        with open(os.path.join(TEMP_FILE_PATH, 'pattern.utf8'), 'r', encoding='utf-8') as f:
            _FILTER_CONTENTS = [line.strip() for line in f if line.strip()]
    return _FILTER_CONTENTS

def parse_log_line(line):
    """Parse one log line; return dist_id, account, gid and the matched filter content."""
    # Extract the dist id from the file-path prefix grep adds to each matched line
    dist_match = DIST_ID_PATTERN.search(line)
    if not dist_match:
        return None
    dist_id = dist_match.group(1)
    # Extract account and gid from the "[...(account/gid)]" marker
    account_gid_match = ACCOUNT_GID_PATTERN.search(line)
    if not account_gid_match:
        return None
    account, gid = account_gid_match.groups()
    # Find which filter pattern matched this line
    filter_match = next((c for c in _load_filter_contents() if c in line), None)
    if not filter_match:
        return None
    return {
        'dist_id': dist_id,
        'account': account,
        'gid': gid,
        'filter_content': filter_match
    }
def process_log_results(file_path, log_date):
    """Parse the grep output file and deduplicate the hits."""
    results = []
    seen = set()
    try:
        with open(file_path, 'r', encoding='gb18030', errors='ignore') as f:
            # Parse the lines in a thread pool; map() preserves input order
            with ThreadPoolExecutor(max_workers=8) as executor:
                for result in executor.map(parse_log_line, f):
                    if result:
                        # Deduplicate on the composite key (dist_id, account, gid, filter_content)
                        key = (result['dist_id'], result['account'], result['gid'], result['filter_content'])
                        if key not in seen:
                            seen.add(key)
                            result['log_date'] = log_date
                            results.append(result)
        logger.info(f"Parsed {len(results)} valid records")
        return results
    except Exception as e:
        logger.error(f"Failed to process the log file: {e}")
        return []
def query_hive_data(parsed_data, log_date):
    """Query player data from Hive."""
    if not parsed_data:
        logger.warning("No parsed data; skipping the Hive query")
        return []
    conn = None
    cursor = None
    try:
        conn = hive.Connection(**HIVE_CONFIG)
        cursor = conn.cursor()
        # Build the query. Two caveats: the values are interpolated directly
        # (no parameter binding), and an inline VALUES list in a FROM clause is
        # not standard HiveQL -- a UNION ALL of SELECTs or a temporary table is
        # the portable alternative.
        query = f"""
        SELECT
            t.dist_id,
            sc.sub_name,
            t.filter_content,
            t.account,
            t.gid,
            COALESCE(ci.level, 0) AS level,
            COALESCE(ci.zhanli_lv, 0) AS zhanli_lv,
            COALESCE(a.gold_coin, 0) AS gold_coin
        FROM (
            SELECT
                CAST(dist_id AS INT) AS dist_id,
                account,
                gid,
                filter_content
            FROM (VALUES {','.join([f"('{item['dist_id']}','{item['account']}','{item['gid']}','{item['filter_content']}')" for item in parsed_data])})
                AS t(dist_id, account, gid, filter_content)
        ) t
        LEFT JOIN sub_category sc ON t.dist_id = sc.dist_id
        LEFT JOIN account a ON t.dist_id = a.pt_dist AND t.account = a.account
        LEFT JOIN char_info ci ON t.dist_id = ci.pt_dist AND t.gid = ci.gid
        """
        logger.info("Running the Hive query...")
        print(query)  # execution is disabled below; print the query for inspection
        '''
        cursor.execute(query)
        results = []
        for row in cursor.fetchall():
            results.append({
                'log_date': log_date,
                'sub_name': row[1] or 'Unknown',
                'dist_id': row[0],
                'filter_content': row[2],
                'account': row[3],
                'gid': row[4],
                'level': row[5],
                'zhanli_lv': row[6],
                'gold_coin': row[7]
            })
        logger.info(f"Fetched {len(results)} player records from Hive")
        return results
        '''
        return []  # nothing is fetched while execution is commented out
    except Exception as e:
        logger.error(f"Hive query failed: {e}")
        return []
    finally:
        # conn/cursor may never have been assigned if the connection failed
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
def save_to_player_data(data):
    """Save the final results into the apex_player_data table."""
    if not data:
        logger.warning("No data to save")
        return 0
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**MYSQL_CONFIG)
        cursor = conn.cursor()
        insert_sql = """
            INSERT INTO apex_player_data (
                log_date, sub_name, dist_id, filter_content,
                account, gid, level, zhanli_lv, gold_coin
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        # Insert in batches
        batch_size = 100
        total_rows = len(data)
        for i in range(0, total_rows, batch_size):
            batch = data[i:i+batch_size]
            values = [
                (item['log_date'], item['sub_name'], item['dist_id'], item['filter_content'],
                 item['account'], item['gid'], item['level'], item['zhanli_lv'], item['gold_coin'])
                for item in batch
            ]
            cursor.executemany(insert_sql, values)
            conn.commit()
            logger.info(f"Saved {min(i+batch_size, total_rows)}/{total_rows} records")
        logger.info(f"Saved {total_rows} records into apex_player_data")
        return total_rows
    except mysql.connector.Error as err:
        logger.error(f"MySQL error: {err}")
        return 0
    finally:
        if conn is not None and conn.is_connected():
            cursor.close()
            conn.close()
def cleanup_temp_files():
    """Remove the temporary files."""
    try:
        files_to_remove = [
            os.path.join(TEMP_FILE_PATH, 'pattern.utf8'),
            os.path.join(TEMP_FILE_PATH, 'pattern.gb'),
            os.path.join(TEMP_FILE_PATH, 'filter_results.log')
        ]
        for file_path in files_to_remove:
            if os.path.exists(file_path):
                os.remove(file_path)
                logger.info(f"Removed temporary file: {file_path}")
    except Exception as e:
        logger.error(f"Failed to clean up temporary files: {e}")
def main():
    logger.info("Starting the log filtering job")
    create_temp_dir()
    # Step 1: resolve the date to process
    log_date_str, log_date_dash = get_previous_date()
    logger.info(f"Processing date: {log_date_dash}")
    # Step 2: fetch the filter patterns
    filter_count = fetch_filter_contents()
    if filter_count == 0:
        cleanup_temp_files()
        return
    # Step 3: run the grep search
    result_file = run_grep_search(log_date_str)
    if not result_file:
        cleanup_temp_files()
        return
    # Step 4: parse the results
    parsed_data = process_log_results(result_file, log_date_dash)
    if not parsed_data:
        cleanup_temp_files()
        logger.info("No matching log records; job finished")
        return
    # Step 5: query Hive for the player data
    hive_data = query_hive_data(parsed_data, log_date_dash)
    # Step 6: save into the final table
    if hive_data:
        save_to_player_data(hive_data)
    # Clean up temporary files
    cleanup_temp_files()
    logger.info("Job finished")

if __name__ == "__main__":
    main()
-------
Please change this script as follows: loop over every dist_id directory under the log directory, run the filter step for each one, and write the output to log_tmp/{dist_id}_{log_date}.log.
Then do the account / gid / filter_content matching, loop over the dist_ids, using each one as the WHERE condition when querying the Hive database (one dist per query), and insert the returned rows into the MySQL table apex_player_data.
Given dist_id, account and gid, look up the account's power level, character level and gold ingot count:
dim_game_wd.sub_category: look up sub_name (the dist group name) by dist_id
dim_game_wd.account: look up gold_coin (gold ingots) by pt_dist = dist_id, account = account
dim_game_wd.char_info: look up level and zhanli_lv by pt_dist = dist_id, gid = gid
Finally return the filter content (filter_content) together with all the corresponding columns of the apex_player_data table. A sketch of this restructure follows below.
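A minimal sketch of that restructure, reusing the config and helpers from the script above (create_temp_dir, get_previous_date, fetch_filter_contents, process_log_results, save_to_player_data, cleanup_temp_files). The names run_grep_search_for_dist, query_hive_for_dist and main_per_dist are made up for illustration; it assumes account is string-typed in Hive and normalizes gid to a string when merging, and it interpolates values into the SQL directly just as the original does, so trusted input is assumed.

def run_grep_search_for_dist(dist_id, log_date_str):
    """Filter one dist's logs into log_tmp/{dist_id}_{log_date}.log."""
    gb_file = os.path.join(TEMP_FILE_PATH, 'pattern.gb')
    result_file = os.path.join(TEMP_FILE_PATH, f'{dist_id}_{log_date_str}.log')
    cmd = f"grep -a -F -f {gb_file} {LOG_BASE_PATH}/{dist_id}/svr*/*{log_date_str}.txt > {result_file}"
    subprocess.run(cmd, shell=True)
    return result_file

def query_hive_for_dist(cursor, dist_id, parsed, log_date):
    """Three lookups for a single dist; dist_id is the WHERE condition."""
    accounts = ','.join(f"'{r['account']}'" for r in parsed)
    gids = ','.join(f"'{r['gid']}'" for r in parsed)
    # dim_game_wd.sub_category: dist group name by dist_id
    cursor.execute(f"SELECT sub_name FROM sub_category WHERE dist_id = {int(dist_id)}")
    row = cursor.fetchone()
    sub_name = row[0] if row else 'Unknown'
    # dim_game_wd.account: gold_coin by pt_dist + account
    cursor.execute(f"SELECT account, gold_coin FROM account "
                   f"WHERE pt_dist = {int(dist_id)} AND account IN ({accounts})")
    gold_by_account = {str(acc): coin for acc, coin in cursor.fetchall()}
    # dim_game_wd.char_info: level and zhanli_lv by pt_dist + gid
    cursor.execute(f"SELECT gid, level, zhanli_lv FROM char_info "
                   f"WHERE pt_dist = {int(dist_id)} AND gid IN ({gids})")
    char_by_gid = {str(gid): (lvl, zl) for gid, lvl, zl in cursor.fetchall()}
    # Merge the three lookups back onto the parsed log hits
    rows = []
    for r in parsed:
        level, zhanli_lv = char_by_gid.get(r['gid'], (0, 0))
        rows.append({
            'log_date': log_date, 'sub_name': sub_name, 'dist_id': r['dist_id'],
            'filter_content': r['filter_content'], 'account': r['account'],
            'gid': r['gid'], 'level': level, 'zhanli_lv': zhanli_lv,
            'gold_coin': gold_by_account.get(r['account'], 0),
        })
    return rows

def main_per_dist():
    logger.info("Starting the per-dist log filtering job")
    create_temp_dir()
    log_date_str, log_date_dash = get_previous_date()
    if fetch_filter_contents() == 0:
        cleanup_temp_files()
        return
    conn = hive.Connection(**HIVE_CONFIG)
    cursor = conn.cursor()
    try:
        # Every numeric subdirectory of LOG_BASE_PATH is treated as one dist_id
        for dist_id in sorted(d for d in os.listdir(LOG_BASE_PATH) if d.isdigit()):
            result_file = run_grep_search_for_dist(dist_id, log_date_str)
            parsed = process_log_results(result_file, log_date_dash)
            if parsed:
                save_to_player_data(query_hive_for_dist(cursor, dist_id, parsed, log_date_dash))
    finally:
        cursor.close()
        conn.close()
    cleanup_temp_files()
    logger.info("Per-dist job finished")

Querying one dist at a time keeps each IN list small and sidesteps the inline VALUES-in-FROM construct from the original query, which HiveQL may not accept; the per-dist result files under log_tmp/ are deliberately left in place as the requested output.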