Python3 安装 BuiltWith 模块

最新推荐文章于 2025-02-11 18:10:38 发布

早起的鸟儿有虫吃h

最新推荐文章于 2025-02-11 18:10:38 发布

阅读量655

点赞数

分类专栏： Python爬虫文章标签： python 爬虫

Python爬虫专栏收录该内容

7 篇文章

订阅专栏

本文介绍如何在Python3环境下安装并配置BuiltWith模块，以实现网站技术信息的查询功能。文中提供了详细的步骤说明，并针对Python3的兼容性问题提出了具体的解决方案。

Python3 安装 BuiltWith 模块

原作者：https://www.iflei.com/python3-builtwith-module-installation.html

BuiltWith是一个网站技术信息查询工具，开发团队来自于澳大利亚。利用该网站所提供的功能，你可以查询出某网站背后是由哪些技术来支持的，比如操作系统的类型、所采用的访问量统计服务、采用的发布平台、广告平台、语言框架、聚合功能、页面文档信息、网站编码及操作系统信息等等。可帮助更深一层次了解该网站的运营情况。

但是这么好的工具是用 python2 开发的，而 python3 的语法和 python2 有一些区别，所以 BuiltWith 不能在 python3 下直接使用。我们只需修改部分代码，就可以完美兼容 python3 了。

我的 python3 安装在 /usr/local/python/ 目录下，所以 BuiltWith 安装在 /usr/local/python/lib/python3.5/site-packages/builtwith 下。我们把这个目录下的 __init__.py 文件替换成下面的代码即可。

声明一下：以下配置在 CentOS 6测试通过，其他系统请随机应变

如果你还没安装pip，root用户可以用下面命令安装

wget https://bootstrap.pypa.io/get-pip.py
python get-pip.py

首先通过pip安装BuiltWith

pip install builtwith

然后覆盖__init__.py文件

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

import sys

import os

import re

import json

import urllib . request

import urllib . error

import chardet

def builtwith(url, headers=None, html=None, user_agent='builtwith'):
    """Detect the technologies used to build a website.

    The URL, the HTTP response headers and the page markup are each matched
    against the signatures stored in the module-level ``data['apps']`` table.

    Example (the result depends on the live site, shown for shape only)::

        builtwith('http://example.com')
        # -> {'web-servers': ['Nginx'], 'javascript-frameworks': ['jQuery']}

    Args:
        url: Address of the site to inspect.
        headers: Response headers, if already known; any object offering a
            dict-style ``get``. When ``None`` they are fetched from ``url``.
        html: Page markup, if already known. ``bytes`` are decoded to text
            first. When ``None`` the page is downloaded from ``url``.
        user_agent: Value sent as the ``User-Agent`` request header.

    Returns:
        A dict mapping each category name to the list of application names
        detected in that category. Empty when nothing matched.

    Download failures are printed and detection carries on with whatever
    headers/markup are available, so a network error never raises from here.
    """
    techs = {}
    apps = data['apps']

    # check URL
    for app_name, app_spec in apps.items():
        if 'url' in app_spec and contains(url, app_spec['url']):
            add_app(techs, app_name, app_spec)

    # download content
    if None in (headers, html):
        try:
            request = urllib.request.Request(url, None, {'User-Agent': user_agent})
            if html:
                # already have HTML so just need to make HEAD request for headers
                request.get_method = lambda: 'HEAD'
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(request) as response:
                if headers is None:
                    headers = response.headers
                if html is None:
                    html = response.read()
        except Exception as e:
            # Best effort: report the problem and keep whatever was obtained.
            print('Error:', e)

    # The signature patterns are text, so the markup must be text as well.
    if isinstance(html, bytes):
        html = _decode_html(html)

    # check headers
    if headers:
        for app_name, app_spec in apps.items():
            if 'headers' in app_spec and contains_dict(headers, app_spec['headers']):
                add_app(techs, app_name, app_spec)

    # check html (and meta tags, which are parsed out of the same markup)
    if html:
        for app_name, app_spec in apps.items():
            for key in 'html', 'script':
                snippets = app_spec.get(key, [])
                if not isinstance(snippets, list):
                    snippets = [snippets]
                for snippet in snippets:
                    if contains(html, snippet):
                        add_app(techs, app_name, app_spec)
                        # one matching snippet per key is enough
                        break

        # check meta
        # XXX add proper meta data parsing
        metas = dict(re.compile('<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?content=[\'"]([^>]*?)[\'"][^>]*?>', re.IGNORECASE).findall(html))
        for app_name, app_spec in apps.items():
            for name, content in app_spec.get('meta', {}).items():
                if name in metas and contains(metas[name], content):
                    add_app(techs, app_name, app_spec)
                    break

    return techs


def _decode_html(raw):
    """Decode page bytes to text using chardet's guess, never raising."""
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    if encoding.lower() == 'gb2312':
        # GBK is a superset of GB2312; pages labelled GB2312 often contain
        # GBK-only characters, so decode with the wider codec.
        encoding = 'gbk'
    try:
        return raw.decode(encoding, errors='replace')
    except LookupError:
        # chardet named a codec this Python build does not provide.
        return raw.decode('utf-8', errors='replace')

# Alias: ``parse`` is bound to the same function object as ``builtwith``.
parse = builtwith

def add_app(techs, app_name, app_spec):
    """Record an application under its categories, then add the apps it implies.

    Args:
        techs: Result dict (category name -> list of app names); updated in
            place.
        app_name: Name of the detected application.
        app_spec: That application's entry from ``data['apps']``.

    Implied applications are expanded only when ``app_name`` is newly added
    to a category, so mutually-implying entries cannot recurse forever.
    """
    implies = app_spec.get('implies', [])
    if not isinstance(implies, list):
        implies = [implies]

    for category in get_categories(app_spec):
        detected = techs.setdefault(category, [])
        if app_name in detected:
            continue
        detected.append(app_name)

        # A dedicated loop variable is essential here: rebinding ``app_name``
        # would file the last implied app under this app's remaining
        # categories (e.g. 'PHP' showing up under 'blogs').
        for implied in implies:
            # Drop any '\;' metadata suffix, the same separator that
            # contains() strips from patterns.
            implied_name = implied.split('\\;')[0]
            implied_spec = data['apps'].get(implied_name)
            if implied_spec is not None:
                add_app(techs, implied_name, implied_spec)

def get_categories(app_spec):
    """Translate the category ids in ``app_spec['cats']`` into category names.

    Each id is converted to ``str`` and looked up in ``data['categories']``;
    the names are returned as a list in the same order as the ids.
    """
    names = []
    for cat_id in app_spec['cats']:
        lookup_key = str(cat_id)
        names.append(data['categories'][lookup_key])
    return names

def contains(v, regex):
    """Search ``v`` for ``regex``, ignoring case.

    Anything from the first ``\\;`` onwards in ``regex`` is metadata and is
    discarded before matching. Returns the match object, or ``None`` when
    the pattern is not found.
    """
    pattern, _separator, _metadata = regex.partition('\\;')
    return re.search(pattern, v, flags=re.IGNORECASE)

def contains_dict(d1, d2):
    """Check ``d1`` against every pattern in ``d2``.

    Returns True only if, for each key of ``d2``, ``d1`` holds a truthy
    value under that key and the value matches the pattern from ``d2``.
    An empty ``d2`` yields True.
    """
    def _satisfied(item):
        key, pattern = item
        value = d1.get(key)
        return bool(value) and bool(contains(value, pattern))

    # map() is lazy, so all() stops at the first key that fails.
    return all(map(_satisfied, d2.items()))

def load_apps(filename='apps.json.py'):
    """Load apps from Wappalyzer JSON (https://github.com/ElbertF/Wappalyzer)

    Args:
        filename: Path of the JSON file. A relative path is resolved against
            the directory containing this module; an absolute path is used
            as given.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: The file cannot be opened.
        ValueError: The file does not contain valid JSON.
    """
    # get the path of this filename relative to the current script
    # XXX add support to download update
    filename = os.path.join(os.getcwd(), os.path.dirname(__file__), filename)
    # Read explicitly as UTF-8 (JSON's standard encoding) rather than the
    # locale default, and close the handle deterministically.
    with open(filename, encoding='utf-8') as apps_file:
        return json.load(apps_file)

# Load the signature table once at import time; the functions above read it
# through the module-level name ``data``.
data = load_apps ( )

if __name__ == '__main__':
    # Command-line use: report the detected technologies for each URL given.
    urls = sys.argv[1:]
    if not urls:
        print('Usage: %s url1 [url2 url3 ...]' % sys.argv[0])
    for url in urls:
        # One "category: [apps]" line per category, in sorted category order.
        for result in sorted(builtwith(url).items()):
            print('%s: %s' % result)

在对网页内容进行解码时，需要先通过 chardet.detect() 获取原网页的编码

需要安装 chardet 包，我们通过 pip 安装

pip install chardet

下面我们测试一下BuiltWith

Python3 安装 BuiltWith 模块