最近项目需要通过访问日志统计每秒并发量、处理时间超过 60ms 的请求数、每小时处理量等数据,因此花了一点时间用 Python 分析 access 日志并生成数据报表。下面直接上代码:脚本简单快速,一天十几个 G 的 access 日志文件,几分钟内即可得到相应的报表。
#coding=utf-8
import os
import xlwt
import time
# Expected log format (nginx log_format):
#   '$remote_addr - $remote_user [$time_local][$request_time] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" $http_x_forwarded_for'
# Sample line:
#   100.109.192.22 - - [09/Mar/2017:00:00:16 +0800][0.002] "POST /d_iqiyi HTTP/1.0" 204 0 "-" "-" 123.125.118.42

# Access log to analyse; for a local run point this at e.g. "access-rtb.log".
FILE_NAME = "/alidata1/wwwlogs/rtb_2017/03/access-rtb_20170311.log"

# Statistic tables bucketed by timestamp granularity.
# NOTE(review): presumably bucket -> count; confirm against the main loop.
time_hour_statistic = {}
time_min_statistic = {}
time_second_statistic = {}

# Statistic tables for the remaining dimensions (client IP, processing
# time, platform) -- each one is its own independent dict.
platform_statistic = {}
proccess_time_statistic = {}
remote_ip_statistic = {}
def from_this_dir(filename):
    """Return the path of *filename* inside the directory of this script.

    The directory is derived from the absolute path of ``__file__``, so the
    result does not depend on the current working directory.

    Args:
        filename: File name (or relative path) to place next to this script.
            As with any ``os.path.join`` call, an absolute *filename* is
            returned unchanged.

    Returns:
        The joined path as a string.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(script_dir, filename)
def time_out_check(proccess_time, threshold_ms=60):
    """Tell whether a request took longer than the slow-request threshold.

    Args:
        proccess_time: Request processing time in seconds, as the text
            sliced out of the log line (e.g. ``"0.002"``).
        threshold_ms: Threshold in milliseconds; a request is slow only
            when it is strictly above this value. Defaults to 60.

    Returns:
        1 if the request exceeded the threshold, otherwise 0. Empty,
        missing or non-numeric values count as 0, so that one malformed
        log line cannot abort the whole analysis.
    """
    if not proccess_time:
        return 0
    try:
        elapsed_ms = float(proccess_time) * 1000
    except (TypeError, ValueError):
        # The field is cut out of the line by fixed offsets, so a line in
        # an unexpected format yields text that is not a number.
        return 0
    return 1 if elapsed_ms > threshold_ms else 0
# --- Mutable run state shared with the main read loop -------------------
# Every chained assignment below binds immutable values only (0, "", None),
# so the names stay independent of each other.

# file_seek_index: byte offset the main loop seeks to before reading and
# refreshes from tell() afterwards.
# NOTE(review): del_index / line_index / cell_index are not used in the
# visible part of the loop -- presumably row/column cursors; confirm.
file_seek_index = del_index = line_index = cell_index = 0

# time_hour / time_day: hour- and day-level prefixes of the current line's
# timestamp; the loop resets them to "" at end of file.
# NOTE(review): old_time_* presumably hold the previous line's values so a
# rollover can be detected -- confirm against the rest of the loop.
old_time_hour = old_time_day = time_hour = time_day = ""

# file_name_time_str receives the per-second timestamp of the current line.
# NOTE(review): file_name is presumably the report file name -- confirm.
file_name = file_name_time_str = ""

# Log file handle; the main loop opens it lazily while it is still None.
file_handle = None

# NOTE(review): presumably the xlwt workbook and its per-statistic sheets,
# created later in the script -- confirm.
wbk = None
second_sheet = min_sheet = hour_sheet = None
remoteip_sheet = proccess_sheet = platform_sheet = None

# Set to True by the main loop once readline() returns '' (end of file).
quit_flag = False
while(not quit_flag):
if file_handle == None:
file_handle = open(FILE_NAME)
file_handle.seek(file_seek_index)
line = file_handle.readline()
file_seek_index = file_handle.tell()
if line == '':
quit_flag = True
time_hour = ""
time_day = ""
if quit_flag == False:
line = line.strip('\n')
strs = line.split(' ')
time_second = strs[3][1:]
file_name_time_str = time_second
time_min = time_second[0:len(time_second)-3]
time_hour = time_min[0:len(time_min)-3]
time_day = time_hour[0:len(time_hour) - 3]
proccess_time = strs[4][7:12]
remote_ip = strs[