BuiltWith是一个网站技术信息查询工具,开发团队来自于澳大利亚。利用该网站所提供的功能,你可以查询出某网站背后是由哪些技术来支持的,比如操作系统的类型、所采用的访问量统计服务、采用的发布平台、广告平台、语言框架、聚合功能、页面文档信息、网站编码及操作系统信息等等。可帮助更深一层次了解该网站的运营情况。
但是这么好的工具是基于 Python 2 开发的,而 Python 3 的语法与 Python 2 有一些区别,所以 BuiltWith 不能在 Python 3 下直接使用。不过我们只需修改部分代码,就可以让它兼容 Python 3 了。
我的 python3 安装在 /usr/local/python/ 目录下,所以 BuiltWith 安装在 /usr/local/python/lib/python3.5/site-packages/builtwith 下。我们把这个目录下的 __init__.py 文件替换成下面的代码即可。
声明一下:以下配置在 CentOS 6 上测试通过,其他系统请根据实际情况调整。
如果你还没安装pip,root用户可以用下面命令安装
首先通过pip安装BuiltWith
然后覆盖__init__.py文件
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
|
import
sys
import
os
import
re
import
json
import
urllib
.
request
import
urllib
.
error
import
chardet
def builtwith(url, headers=None, html=None, user_agent='builtwith'):
    """Detect the technology used to build a website.

    Every app in the module-level ``data['apps']`` is matched against the
    URL, the response headers, the page source (``html`` and ``script``
    patterns) and the page's ``<meta name=... content=...>`` tags.

    Args:
        url: Address of the site to analyse.
        headers: Response headers, an object with a dict-like ``get``.
            Fetched from ``url`` when None.
        html: Page source as str or bytes. Downloaded from ``url`` when
            None; when it is supplied but ``headers`` is not, only a HEAD
            request is made.
        user_agent: Value sent as the ``User-Agent`` request header.

    Returns:
        dict mapping a category name to the list of app names detected in
        that category.

    A failed download is reported with ``print`` and detection carries on
    with whatever information is available.

    Example (the result depends on the live site):

    >>> builtwith('http://webscraping.com')  # doctest: +SKIP
    {'javascript-frameworks': ['jQuery', 'Modernizr'], 'web-frameworks': ['Twitter Bootstrap'], 'web-servers': ['Nginx']}
    """
    techs = {}

    # check URL
    for app_name, app_spec in data['apps'].items():
        if 'url' in app_spec and contains(url, app_spec['url']):
            add_app(techs, app_name, app_spec)

    # download content
    if None in (headers, html):
        try:
            request = urllib.request.Request(url, None, {'User-Agent': user_agent})
            if html:
                # already have HTML so just need to make HEAD request for headers
                request.get_method = lambda: 'HEAD'
            # the context manager closes the connection even if read() fails
            with urllib.request.urlopen(request) as response:
                if headers is None:
                    headers = response.headers
                if html is None:
                    html = response.read()
        except Exception as e:
            # best effort: report the failure and keep going with what we have
            print('Error:', e)

    # The patterns are str regexes, so the page must be str as well; this
    # covers both the downloaded body and bytes passed in by the caller.
    if isinstance(html, bytes):
        html = _decode_html(html)

    # check headers
    if headers:
        for app_name, app_spec in data['apps'].items():
            if 'headers' in app_spec and contains_dict(headers, app_spec['headers']):
                add_app(techs, app_name, app_spec)

    # check html
    if html:
        for app_name, app_spec in data['apps'].items():
            for key in 'html', 'script':
                snippets = app_spec.get(key, [])
                if not isinstance(snippets, list):
                    snippets = [snippets]
                for snippet in snippets:
                    if contains(html, snippet):
                        add_app(techs, app_name, app_spec)
                        # one matching snippet per key is enough
                        break

        # check meta
        # XXX add proper meta data parsing
        metas = dict(re.compile('<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?content=[\'"]([^>]*?)[\'"][^>]*?>', re.IGNORECASE).findall(html))
        for app_name, app_spec in data['apps'].items():
            for name, content in app_spec.get('meta', {}).items():
                if name in metas:
                    if contains(metas[name], content):
                        add_app(techs, app_name, app_spec)
                        break

    return techs


def _decode_html(raw):
    """Decode page bytes to str using chardet's guess, without ever raising."""
    # chardet reports None when it cannot make a guess
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    if encoding.lower() == 'gb2312':
        # GBK is a superset of GB2312; decode such pages as GBK, as the
        # previous hard-coded fallback did
        encoding = 'gbk'
    try:
        return raw.decode(encoding, errors='replace')
    except LookupError:
        # chardet named a codec this Python build does not provide
        return raw.decode('utf-8', errors='replace')


# `parse` is a second name for the same function
parse = builtwith
def add_app(techs, app_name, app_spec):
    """Record an app, and every app it implies, under each of its categories.

    Args:
        techs: dict mapping category name to a list of app names; updated
            in place.
        app_name: Name of the detected app.
        app_spec: The app's entry from ``data['apps']``. Its categories are
            resolved with ``get_categories``; its optional ``implies`` value
            may be a single app name or a list of names.

    Raises:
        KeyError: if an implied app has no entry in ``data['apps']``.
    """
    implies = app_spec.get('implies', [])
    if not isinstance(implies, list):
        implies = [implies]

    for category in get_categories(app_spec):
        apps = techs.setdefault(category, [])
        if app_name in apps:
            continue
        apps.append(app_name)
        # Use a separate loop variable: reusing `app_name` here would make
        # the remaining categories receive the last implied app instead of
        # the app that was actually detected.
        for implied_name in implies:
            add_app(techs, implied_name, data['apps'][implied_name])
def get_categories(app_spec):
    """Translate the category ids in ``app_spec['cats']`` into category names.

    Each id is converted to a string and looked up in the module-level
    ``data['categories']``; the names are returned as a list in the same
    order as the ids.
    """
    names = []
    for cat_id in app_spec['cats']:
        names.append(data['categories'][str(cat_id)])
    return names
def contains(v, regex):
    """Search ``v`` case-insensitively for ``regex`` with its metadata removed.

    Everything from the first ``\\;`` separator onward is dropped from
    ``regex`` before it is used as the pattern.

    Returns:
        The match object, or None when the pattern is not found in ``v``.
    """
    pattern, _sep, _metadata = regex.partition('\\;')
    return re.search(pattern, v, flags=re.IGNORECASE)
def contains_dict(d1, d2):
    """Check that every pattern in ``d2`` matches the same-keyed value in ``d1``.

    For each key/pattern pair in ``d2`` the value ``d1.get(key)`` must be
    truthy and must satisfy ``contains(value, pattern)``.

    Returns:
        True when all pairs are satisfied (including when ``d2`` is empty),
        otherwise False.
    """
    for key, pattern in d2.items():
        value = d1.get(key)
        if not value or not contains(value, pattern):
            return False
    return True
def load_apps(filename='apps.json.py'):
    """Load apps from Wappalyzer JSON (https://github.com/ElbertF/Wappalyzer).

    Args:
        filename: Name of the JSON file, resolved relative to the directory
            that contains this module. An absolute path is used as given.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file does not contain valid JSON.
    """
    # XXX add support to download update
    # abspath() anchors a relative __file__ at the current working directory
    module_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(module_dir, filename)
    # Read as UTF-8 rather than the locale default, and close the file
    # as soon as parsing is done.
    with open(path, encoding='utf-8') as fp:
        return json.load(fp)
# App and category definitions, parsed once when the module is imported;
# the detection functions in this module read this global.
data = load_apps()


if __name__ == '__main__':
    # Command-line use: analyse every URL passed as an argument and print
    # one "category: apps" line per detected category, sorted by category.
    urls = sys.argv[1:]
    if not urls:
        print('Usage: %s url1 [url2 url3 ...]' % sys.argv[0])
    for url in urls:
        for category_and_apps in sorted(builtwith(url).items()):
            print('%s: %s' % category_and_apps)
|
在对下载到的网页内容进行解码时,需要先通过 chardet.detect() 检测原网页的编码
需要安装 chardet 包,我们通过 pip 安装
下面我们测试一下BuiltWith