Scraping the links
http://www.chinadaily.com.cn/a/201804/14/WS5ad15641a3105cdcf6518417.html
Starting from this Chinese-English article, follow the English articles behind the URLs in its <a> tags; by iterating this process again and again, almost every English article on the site can be collected.
import requests
from bs4 import BeautifulSoup
import time
import random
import os

visited_urls = set()

def get_raw_html(url):
    if url not in visited_urls:
        time.sleep(2)
        response = requests.get(url)
        visited_urls.add(url)
        if response.status_code == 200:
            print("url:", url, "okey")
            return BeautifulSoup(response.text, "html.parser")
    return None

def extract_urls(raw_html):
    urls = raw_html.find_all('a')
    result = set()
    for url in urls:
        if url.has_attr('href') and url['href'].startswith('http://www.chinadaily.com.cn/a/'):
            # keep only links whose href starts with the article URL prefix
            result.add(url['href'])
    return result
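With get_raw_html and extract_urls in place, the "iterate again and again" crawl described above can be driven by a simple frontier loop. The sketch below is only an illustration under assumptions, not part of the original code: the deque-based frontier, the max_pages cap, and the crawl function name are all invented here.

from collections import deque

def crawl(start_url, max_pages=50):
    # Breadth-first walk: every article page found feeds new links back into the frontier.
    frontier = deque([start_url])
    pages = {}
    while frontier and len(pages) < max_pages:
        url = frontier.popleft()
        raw_html = get_raw_html(url)   # None for already-visited URLs or failed requests
        if raw_html is None:
            continue
        pages[url] = raw_html
        # Only follow links that have not been fetched yet.
        frontier.extend(extract_urls(raw_html) - visited_urls)
    return pages

Calling crawl with the article URL at the top of this section would fetch that page first and then spread outward through the article links it contains, which is exactly the iterative idea described above.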
def extract_content(raw_html):