引言
三年前,我参加了数学建模校队,当时同队的小兄弟想让我学爬虫,然后太菜的我被劝退了。从数学系毕业,到软院读研,学到了前端的知识以后,发现爬虫似乎并不难,最近因为论文需要,写了第一个爬虫,很开心。
目标
我爬取了采用工作量证明(PoW)的比特币区块链上的交易信息。
数据可以在 blockchain.info 上找到(下面的代码实际访问的是 www.blockchain.com)。目前爬取到的还是未经清洗的脏数据(结果中仍带有 HTML 标签片段),需要进一步处理。
代码
代码运行在 Python 3.7 上。
import requests
from bs4 import BeautifulSoup
import re
def getHTMLText(url):
    """Fetch *url* and return the response body decoded as text.

    This is a best-effort helper: if the request itself fails (connection
    error, timeout, malformed URL, ...) it returns an empty string instead
    of raising, so callers can simply treat "" as "nothing to parse".

    Args:
        url: Absolute URL to download.

    Returns:
        The response body as a string, or "" when the request failed.
    """
    try:
        # 30 s timeout so a stalled connection cannot hang the crawl forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Only request failures are swallowed; KeyboardInterrupt, SystemExit
        # and genuine programming errors still propagate.
        return ""
    return response.text
def main(path):
    """Crawl the hard-coded block-list page and append raw matches to *path*.

    The function downloads the block-list page, follows every
    ``/zh/btc/block/...`` link found in it, and cuts each block page into
    chunks that start at a ``pull-right`` span and end at the next
    ``/zh/btc/tx`` link. For every chunk three groups of regex matches are
    written, one match per line and each group followed by a blank line:

    1. ``<span class="pull-right">...</span>`` fragments,
    2. ``/zh/btc/address/..."`` links,
    3. ``<span class="pull-right hidden-phone">...BTC`` fragments.

    The matches are written unprocessed (HTML tags included). After each
    block page the fraction of processed pages is printed as progress.

    Args:
        path: File the matches are appended to (UTF-8). It is only opened
            when there is something to write.

    Returns:
        None. If the output file cannot be written, the error is reported
        on stderr and the crawl stops.
    """
    import sys  # local import: only needed for the error report below

    url = 'https://www.blockchain.com/zh/btc/blocks/1551262244728'
    start_url = 'https://www.blockchain.com'

    # Compile once up front; these are applied to every page and chunk.
    block_link_re = re.compile(r'/zh/btc/block/.*?\"')
    tx_chunk_re = re.compile(r'<span class=\"pull-right\">.*?/zh/btc/tx')
    field_res = (
        re.compile(r'<span class=\"pull-right\">.*?</span>'),
        re.compile(r'/zh/btc/address/.*?\"'),
        re.compile(r'<span class=\"pull-right hidden-phone\">.*?BTC'),
    )

    # getHTMLText returns "" on failure, which simply yields no links.
    block_links = block_link_re.findall(getHTMLText(url))
    total = len(block_links)

    for done, link in enumerate(block_links, start=1):
        # Every match ends with the closing quote of the href; strip it.
        block_html = getHTMLText(start_url + link[:-1])

        # Collect all output for this block page, then write it in one go
        # instead of reopening the file for every field group.
        lines = []
        for chunk in tx_chunk_re.findall(block_html):
            for field_re in field_res:
                lines.extend(f'{match}\n' for match in field_re.findall(chunk))
                lines.append('\n')  # blank line terminates each group

        if lines:
            try:
                with open(path, 'a', encoding='utf-8') as f:
                    f.writelines(lines)
            except OSError as err:
                # Previously every error was swallowed silently; an
                # unwritable output file is now reported before giving up.
                print(f'cannot write to {path}: {err}', file=sys.stderr)
                return

        print(done / total)
if __name__ == "__main__":
    # Script entry point: scraped records are appended to this fixed file.
    output_file = "D:/myBlock.txt"
    main(path=output_file)