Python 简单的爬虫案例——获取历史天气(二)
1 上节回顾
上节的目的是获得天气网的城市链接和城市名称,分别存放到arrCityLink和arrCityName中,代码如下:
import requests
import bs4

# The page's A-Z index headers (plus a stray 'h' entry) are anchors too;
# they are not city names and must be filtered out. A set membership test
# replaces the original 27-clause comparison chain.
_INDEX_HEADERS = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ') | {'h'}

response = requests.get('http://lishi.tianqi.com/')
soup = bs4.BeautifulSoup(response.text, "html.parser")

arrCityName = []  # city names
arrCityLink = []  # city page URLs

# Narrow to the region of the page holding the full city index.
for tagone in soup.find_all('div', id="cityall"):
    for tagtwo in tagone.find_all('div', id="tool_site"):
        # Collect city links; '#' hrefs belong to the index headers.
        for a in tagtwo.select('a'):
            if a.get('href') != '#':
                print(a.get('href'))
                arrCityLink.append(a.get('href'))
        # Collect city names, skipping the index-header anchors.
        for a in tagtwo.select('a'):
            if a.text not in _INDEX_HEADERS:
                print(a.text)
                arrCityName.append(a.text)
接下来讲述如何获取月份链接、获取天气状况并存到txt文本中。
2 获取月份链接
import requests
import bs4

url = 'http://lishi.tianqi.com/acheng/index.html'
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, "html.parser")

# Pick out the month-index section of the city page.
for tagone in soup.find_all('div', class_='tqtongji1'):
    for a in tagone.select('a'):
        text = a.text
        # Keep only anchors whose text has the 'YYYY年MM月天气' shape.
        # The length guard fixes the original's unconditional text[4]/text[7]
        # indexing, which raised IndexError on shorter anchor texts.
        if len(text) >= 8 and text[4] == '年' and text[7] == '月':
            print(a.get('href'))
            print(a.text)
结果如下所示。下面是获取天气数据并保存到txt文本的完整代码:
import requests
import bs4
def SaveData(path, url):
    """Fetch one month's weather table from *url* and append it to *path*.

    The page's weather-history section is a flat list of <li> cells, six
    per row: date, max temp, min temp, weather, wind direction, wind force.
    The first six cells are the table header and are skipped. Each row is
    written as one comma-separated line.
    """
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    cells = []
    for tag in soup.find_all('div', class_='tqtongji2'):
        for li in tag.select('li'):
            cells.append(li.text)
    rows = cells[6:]  # drop the 6 header cells
    # Append mode replaces the original open('r+') + read()-to-EOF trick
    # (which crashed if the file did not exist), and the context manager
    # guarantees the handle is closed even if a write fails.
    # NOTE(review): encoding is left at the platform default to match the
    # file created elsewhere with open(path, 'w') — consider utf-8 for both.
    with open(path, 'a') as outfile:
        for i, value in enumerate(rows):
            # Six cells per row: the sixth cell terminates the line.
            sep = '\n' if i % 6 == 5 else ','
            outfile.write(str(value) + sep)
response = requests.get('http://lishi.tianqi.com/')
soup = bs4.BeautifulSoup(response.text, "html.parser")

# A-Z index headers (and the stray 'h' entry) are anchors too; they are
# filtered out of the city-name list with one set-membership test instead
# of the original 27-clause comparison chain.
index_headers = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ') | {'h'}

arrName = []      # text of every <a> on the page
arrLink = []      # href of every <a> on the page
arrCityName = []  # city names
arrCityLink = []  # city page URLs

for a in soup.select('a'):
    arrLink.append(a.get('href'))
    arrName.append(a.text)

# NOTE(review): the bounds 42..456 (links) and 41..456 (names) are
# hard-coded from inspecting the page layout; any change to the site
# breaks them — confirm against the live page.
for link in arrLink[42:456]:
    if link != '#':
        arrCityLink.append(link)

# Diagnostics: print positions of the first index header and of the
# last city ('忠县') to help re-derive the slice bounds above.
for i, name in enumerate(arrName):
    if name == 'A':
        print(i)
for i, name in enumerate(arrName):
    if name == '忠县':
        print(i)

for name in arrName[41:456]:
    if name not in index_headers:
        arrCityName.append(name)

print(len(arrCityLink))
print(len(arrCityName))

for i in range(len(arrCityLink)):
    response = requests.get(arrCityLink[i])
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    # Create (or truncate) the per-city output file.
    path = 'E:/tempt/' + arrCityName[i] + '.txt'
    open(path, 'w').close()
    # Walk the month-index section and save each month's data.
    for tagthree in soup.find_all('div', class_='tqtongji1'):
        for a in tagthree.select('a'):
            text = a.text
            # Keep only 'YYYY年MM月...' style anchors; the length guard
            # prevents IndexError on shorter anchor texts.
            if len(text) >= 8 and text[4] == '年' and text[7] == '月':
                print(a.get('href'))
                print(a.text)
                SaveData(path, a.get('href'))
谢谢!祝各位天天开心。