Real-Time Analysis of Your Website Logs, v1

Python
import time

# Minimal file-tailing sketch that this script builds on:
# file = open('access.log')
# while True:
#     where = file.tell()
#     line = file.readline()
#     if not line:
#         time.sleep(1)
#         file.seek(where)
#     else:
#         print(line)
import re
from operator import itemgetter
from pprint import pprint

from ua_parser import user_agent_parser
import pymongo

def is_spider(ua):
    """Return True if the user-agent string belongs to a known crawler.

    'Googlebot'          // Google crawler
    'Baiduspider'        // Baidu crawler
    'Yahoo! Slurp'       // Yahoo crawler
    'YodaoBot'           // Youdao crawler
    'msnbot' / 'bingbot' // Bing crawler
    'Sosospider'         // Tencent Soso
    'Sogou'              // Sogou
    'Googlebot-Image'    // Google Image Search
    '360Spider'          // 360 Search
    'YandexBot'          // Yandex (Russia)
    'YisouSpider'        // Yisou
    :param ua: raw user-agent string from the log line
    :return: bool
    """
    # Simple substring check over the names listed above.
    spiders = ('Googlebot', 'Baiduspider', 'Yahoo! Slurp', 'YodaoBot',
               'msnbot', 'bingbot', 'Sosospider', 'Sogou',
               'Googlebot-Image', '360Spider', 'YandexBot', 'YisouSpider')
    return any(s in ua for s in spiders)
 
 
 
def translate_time(st):
    # Convert a log timestamp such as "22/Jun/2018:16:06:50" into
    # "2018/06/22 16:06:50": parse it into a struct_time first, then
    # format that in the target style.
    """
    In [19]: a = "22/Jun/2018:16:00:50"

    In [20]: timeArray = time.strptime(a, "%d/%b/%Y:%H:%M:%S")

    In [21]: timeArray
    Out[21]: time.struct_time(tm_year=2018, tm_mon=6, tm_mday=22, tm_hour=16, tm_min=0, tm_sec=50, tm_wday=4, tm_yday=173, tm_isdst=-1)

    In [22]: time.strftime("%Y/%m/%d %H:%M:%S", timeArray)
    Out[22]: '2018/06/22 16:00:50'
    :param st: log timestamp, e.g. "22/Jun/2018:16:00:50"
    :return: reformatted timestamp, e.g. "2018/06/22 16:00:50"
    """
    timeArray = time.strptime(st, "%d/%b/%Y:%H:%M:%S")
    otherStyleTime = time.strftime("%Y/%m/%d %H:%M:%S", timeArray)
    return otherStyleTime
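

# Caveat (not in the original): %b only parses English month abbreviations
# under an English/C locale; under e.g. zh_CN, strptime can raise ValueError.
# An equivalent datetime-based one-liner, as an alternative sketch:
#   from datetime import datetime
#   datetime.strptime(st, "%d/%b/%Y:%H:%M:%S").strftime("%Y/%m/%d %H:%M:%S")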
 
 
def insert_mongo(line_data):
    # Write one parsed record into MongoDB (database "log", collection "data").
    # Note: this helper is defined but never called below; parser_logfile
    # writes to the "log_web" database directly.
    c = pymongo.MongoClient('localhost', 27017)
    c['log']['data'].insert_one(line_data)
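

# Optional tweak (not in the original): index the timestamp field so that
# time-range queries stay fast as the collection grows, e.g.:
#   pymongo.MongoClient('localhost', 27017)['log_web']['data'].create_index('time_st')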
 
 
# Reuse one MongoDB connection instead of opening a new one per log line.
client = pymongo.MongoClient('localhost', 27017)
collection = client['log_web']['data']


def parser_logfile(line):
    pattern = (r'(\d+\.\d+\.\d+\.\d+)\s-\s-\s'  # IP address
               r'\[(.+)\]\s'                    # datetime
               r'"GET\s(.+)\s\w+/.+"\s'         # requested file
               r'(\d+)\s'                       # status
               r'(\d+)\s'                       # bandwidth
               r'"(.+)"\s'                      # referrer
               r'"(.+)"'                        # user agent
               )
    log_data = re.findall(pattern, line)

    for i in log_data:
        ll_data = {
            "ip": i[0],
            # i[1] looks like "22/Jun/2018:16:00:50 +0800"; drop the offset.
            "time_st": translate_time(i[1].split(' ')[0]),
            "url": i[2],
            "status": i[3],
            "bandwidth": i[4],
            "referrer": i[5],
            # Only run the UA parser on long strings; short ones are kept as-is.
            "user_agent": user_agent_parser.ParseDevice(i[6]) if len(i[6]) > 30 else i[6],
        }
        collection.insert_one(ll_data)
        pprint(ll_data)
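

# Worked example (hypothetical sample line, assumed nginx "combined" format):
#   1.2.3.4 - - [22/Jun/2018:16:00:50 +0800] "GET /index.html HTTP/1.1" 200 1024 "https://www.168seo.cn/" "Mozilla/5.0"
# yields the seven capture groups
#   ('1.2.3.4', '22/Jun/2018:16:00:50 +0800', '/index.html', '200', '1024',
#    'https://www.168seo.cn/', 'Mozilla/5.0')
# Note the pattern only matches GET requests; POST/HEAD lines are skipped.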
 
 
 
 
 
 
# Tail the log file: on EOF, sleep one second and re-seek to the same
# offset; otherwise parse and store the new line.
file = open('www.168seo.cn.log')
while True:
    where = file.tell()
    line = file.readline()
    if not line:
        time.sleep(1)
        file.seek(where)
    else:
        parser_logfile(line)
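

# As written, the loop first replays everything already in the file. To start
# tailing from the current end instead (an optional tweak, not in the
# original), seek to EOF before entering the loop:
#   file.seek(0, 2)  # whence=2 means "relative to end of file"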
 
"""
 
def parser_logfile(logfile):
    pattern = (r'(\d+\.\d+\.\d+\.\d+)\s-\s-\s'  # IP address
               r'\[(.+)\]\s'  # datetime
               r'"GET\s(.+)\s\w+/.+"\s'  # requested file
               r'(\d+)\s'  # status
               r'(\d+)\s'  # bandwidth
               r'"(.+)"\s'  # referrer
               r'"(.+)"'  # user agent
               )
    fi = open(logfile, 'r')
    url_list = []
    for line in fi:
        log_data = re.findall(pattern, line)
        print(log_data)
        url_list.append(log_data)
    fi.close()
    return url_list
 
 
def parser_urllist(url_list):
    urls = []
    for url in url_list:
        for r in url:
            # r[5] is the referrer field; use r[2] to count requested URLs.
            urls.append(r[5])
    return urls
 
 
def get_urldict(urls):
    d = {}
    for url in urls:
        d[url] = d.get(url, 0) + 1
    return d
 
 
def url_count(logfile):
    url_list = parser_logfile(logfile)
    urls = parser_urllist(url_list)
    totals = get_urldict(urls)
    return totals
 
 
if __name__ == '__main__':
    urls_with_counts = url_count('www.168seo.cn.log')
    sorted_by_count = sorted(urls_with_counts.items(), key=itemgetter(1), reverse=True)
    print(sorted_by_count)
"""
 
 
 
 


