# coding: utf-8
"""Compare how fast re, lxml, BeautifulSoup and PyQuery extract a page title.

The page is downloaded a single time and every parser is then timed over the
same in-memory document, so the reported numbers reflect parsing cost rather
than network latency or console output.

Usage: run the file directly; an optional first command-line argument
overrides the default URL.
"""
import re
import sys
import time

# Page parsed by every benchmark unless a URL is given on the command line.
URL = "http://cuiqingcai.com/2621.html"

# Number of times each parser is run inside its timed loop.
REPEAT = 1000

# Seconds to wait for the server before giving up on the download.
REQUEST_TIMEOUT = 30

# Compiled once at import time: the pattern never changes between iterations.
# re.S lets "." match newlines so a title spanning several lines still matches.
TITLE_PATTERN = re.compile("<title>(.*?)</title>", re.S)


def fetch_page(url: str = URL) -> tuple:
    """Download *url* once and return ``(text, content)``.

    ``text`` is the body decoded to ``str`` (used by the regex benchmark) and
    ``content`` is the raw ``bytes`` body (used by the tree-building parsers).

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
    """
    # Imported here so the stdlib-only helpers in this module stay importable
    # when the HTTP client is not installed.
    import requests

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text, response.content


def title_with_re(page: str) -> str:
    """Return the text between the first ``<title>`` tags of *page*.

    Raises:
        ValueError: if *page* contains no ``<title>...</title>`` pair.
    """
    match = TITLE_PATTERN.search(page)
    if match is None:
        raise ValueError("no <title> element found in page")
    return match.group(1)


def build_parsers(text: str, content: bytes) -> list:
    """Return ``(label, parse, page)`` triples, one per benchmarked parser.

    The labels and their order match the summary line printed by ``main``.
    The third-party libraries are imported once here, before any timing
    starts, so import cost is never part of a measurement.
    """
    from bs4 import BeautifulSoup
    from lxml import etree
    from pyquery import PyQuery

    def with_lxml(page):
        return etree.HTML(page).xpath("//title/text()")[0]

    def with_bs4(page):
        return BeautifulSoup(page, "lxml").title.get_text()

    def with_pyquery(page):
        return PyQuery(page)("title").text()

    return [
        ("re :", title_with_re, text),
        ("lxml :", with_lxml, content),
        ("bs4 lxml:", with_bs4, content),
        ("PyQuery", with_pyquery, content),
    ]


def time_parser(parse, page, repeat: int = REPEAT) -> float:
    """Return the seconds taken to call ``parse(page)`` *repeat* times.

    Only the parse calls are inside the timed region; results are discarded
    so that printing does not distort the measurement.
    """
    start = time.perf_counter()
    for _ in range(repeat):
        parse(page)
    return time.perf_counter() - start


def main(argv=None) -> None:
    """Fetch the page once, time every parser, and print a one-line summary.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if present, replaces ``URL``.
    """
    args = sys.argv[1:] if argv is None else argv
    url = args[0] if args else URL

    text, content = fetch_page(url)

    results = []
    for label, parse, page in build_parsers(text, content):
        # Show the extracted title once, outside the timed loop, as a sanity
        # check that the parser actually found it.
        print(parse(page))
        results.append(label + str(time_parser(parse, page)))

    print(*results)


if __name__ == "__main__":
    main()