# Follow an Nginx/Apache access log in real time (tail -f style),
# parse each new line, and insert the structured record into MongoDB.
import re
import time
from operator import itemgetter  # used by the batch version kept at the bottom
from pprint import pprint

import pymongo
from ua_parser import user_agent_parser
def is_spider(ua):
    """Return True if the user agent string belongs to a known crawler.

    Known crawler signatures:
        Googlebot          Google
        Googlebot-Image    Google image search
        Baiduspider        Baidu
        Yahoo! Slurp       Yahoo
        YodaoBot           Youdao
        msnbot / bingbot   Bing
        Sosospider         Tencent Soso
        Sogou              Sogou
        360Spider          360 search
        YandexBot          Yandex (Russia)
        YisouSpider        Yisou

    :param ua: raw user agent string
    :return: True if ua contains a known crawler signature
    """
    bots = ('Googlebot', 'Baiduspider', 'Yahoo! Slurp', 'YodaoBot',
            'msnbot', 'bingbot', 'Sosospider', 'Sogou', 'Googlebot-Image',
            '360Spider', 'YandexBot', 'YisouSpider')
    return any(bot in ua for bot in bots)
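# For example, a Googlebot user agent would be flagged:
#
#   >>> is_spider('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')
#   True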
def translate_time(st):
    """Convert an access-log timestamp such as "22/Jun/2018:16:00:50"
    into "2018/06/22 16:00:50": parse it into a time struct first,
    then format it back out in the new style.

    :param st: timestamp in "%d/%b/%Y:%H:%M:%S" form
    :return: timestamp in "%Y/%m/%d %H:%M:%S" form
    """
    time_array = time.strptime(st, "%d/%b/%Y:%H:%M:%S")
    return time.strftime("%Y/%m/%d %H:%M:%S", time_array)
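# For example:
#
#   >>> translate_time("22/Jun/2018:16:00:50")
#   '2018/06/22 16:00:50'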
def insert_mongo(line_data):
    """Insert one parsed record into the `log.data` collection.

    (Not called by the tail loop below, which writes to `log_web.data`
    directly; kept as a standalone helper.)
    """
    c = pymongo.MongoClient('localhost', 27017)
    c['log']['data'].insert_one(line_data)
def parser_logfile(line):
    """Parse one access-log line and insert the structured record into
    the `log_web.data` collection on the local MongoDB instance."""
    pattern = (r'(\d+\.\d+\.\d+\.\d+)\s-\s-\s'  # IP address
               r'\[(.+)\]\s'                    # datetime
               r'"GET\s(.+)\s\w+/.+"\s'         # requested file (GET requests only)
               r'(\d+)\s'                       # status
               r'(\d+)\s'                       # bandwidth
               r'"(.+)"\s'                      # referrer
               r'"(.+)"')                       # user agent
    for i in re.findall(pattern, line):
        ll_data = {
            "ip": i[0],
            # Drop the timezone offset: "22/Jun/2018:16:00:50 +0800"
            # becomes "22/Jun/2018:16:00:50" before conversion.
            "time_st": translate_time(i[1].split(' ')[0]),
            "url": i[2],
            "status": i[3],
            "bandwidth": i[4],
            "referrer": i[5],
            # Long user agent strings are parsed into structured device
            # info; short ones are stored verbatim.
            "user_agent": user_agent_parser.ParseDevice(i[6]) if len(i[6]) > 30 else i[6],
        }
        # NOTE: opening a client per record is wasteful; a long-running
        # tailer should reuse one module-level MongoClient.
        c = pymongo.MongoClient('localhost', 27017)
        c['log_web']['data'].insert_one(ll_data)
        pprint(ll_data)
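# Illustration with a made-up log line (any line in the combined log
# format, restricted to GET requests by the pattern above, parses the
# same way):
#
#   sample = ('1.2.3.4 - - [22/Jun/2018:16:00:50 +0800] '
#             '"GET /index.html HTTP/1.1" 200 1024 '
#             '"https://www.168seo.cn/" "Mozilla/5.0"')
#   parser_logfile(sample)
#
# inserts and pretty-prints:
#
#   {'bandwidth': '1024',
#    'ip': '1.2.3.4',
#    'referrer': 'https://www.168seo.cn/',
#    'status': '200',
#    'time_st': '2018/06/22 16:00:50',
#    'url': '/index.html',
#    'user_agent': 'Mozilla/5.0'}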
# Follow the log file like `tail -f`: remember the current offset,
# sleep when nothing new has been written, and rewind to the saved
# offset so the next readline() retries from the same position.
file = open('www.168seo.cn.log')
while True:
    where = file.tell()
    line = file.readline()
    if not line:
        time.sleep(1)
        file.seek(where)
    else:
        parser_logfile(line)
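# Once records are flowing in, they can be queried back out, e.g. to
# count requests per status code. A minimal sketch, assuming the same
# localhost MongoDB as above (run it in a separate process, since the
# tail loop above never exits):
#
#   c = pymongo.MongoClient('localhost', 27017)
#   pipeline = [{'$group': {'_id': '$status', 'count': {'$sum': 1}}},
#               {'$sort': {'count': -1}}]
#   for row in c['log_web']['data'].aggregate(pipeline):
#       print(row['_id'], row['count'])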
"""
def parser_logfile(logfile):
pattern = (r''
'(\d+.\d+.\d+.\d+)\s-\s-\s' # IP address
'\[(.+)\]\s' # datetime
'"GET\s(.+)\s\w+/.+"\s' # requested file
'(\d+)\s' # status
'(\d+)\s' # bandwidth
'"(.+)"\s' # referrer
'"(.+)"' # user agent
)
fi = open(logfile, 'r')
url_list = []
for line in fi:
log_data = re.findall(pattern, line)
print(log_data)
url_list.append(log_data)
fi.close()
return url_list
def parser_urllist(url_list):
urls = []
for url in url_list:
for r in url:
urls.append(r[5])
return urls
def get_urldict(urls):
d = {}
for url in urls:
d[url] = d.get(url, 0) + 1
return d
def url_count(logfile):
url_list = parser_logfile(logfile)
urls = parser_urllist(url_list)
totals = get_urldict(urls)
return totals
if __name__ == '__main__':
urls_with_counts = url_count('www.168seo.cn.log')
sorted_by_count = sorted(urls_with_counts.items(), key=itemgetter(1), reverse=True)
print(sorted_by_count)
"""