Python 简单的爬虫案例——获取历史天气(二)
1 上节回顾
上节的目的是获得天气网的城市链接和城市名称,分别存放到arrCityLink和arrCityName中,代码如下:
import requests
import bs4

# The page's A-Z index headers (plus a stray 'h' entry) are anchors too;
# they are not city names and must be filtered out. A set membership test
# replaces the original 27-clause comparison chain.
_INDEX_HEADERS = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ') | {'h'}

response = requests.get('http://lishi.tianqi.com/')
soup = bs4.BeautifulSoup(response.text, "html.parser")

arrCityName = []  # city names
arrCityLink = []  # city page URLs

# Narrow to the region of the page holding the full city index.
for tagone in soup.find_all('div', id="cityall"):
    for tagtwo in tagone.find_all('div', id="tool_site"):
        # Collect city links; '#' hrefs belong to the index headers.
        for a in tagtwo.select('a'):
            if a.get('href') != '#':
                print(a.get('href'))
                arrCityLink.append(a.get('href'))
        # Collect city names, skipping the index-header anchors.
        for a in tagtwo.select('a'):
            if a.text not in _INDEX_HEADERS:
                print(a.text)
                arrCityName.append(a.text)
接下来讲述如何获取月份链接、获取天气状况并存到txt文本中。
2 获取月份链接
import requests
import bs4

url = 'http://lishi.tianqi.com/acheng/index.html'
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, "html.parser")

# Pick out the month-index section of the city page.
for tagone in soup.find_all('div', class_='tqtongji1'):
    for a in tagone.select('a'):
        text = a.text
        # Keep only anchors whose text has the 'YYYY年MM月天气' shape.
        # The length guard fixes the original's unconditional text[4]/text[7]
        # indexing, which raised IndexError on shorter anchor texts.
        if len(text) >= 8 and text[4] == '年' and text[7] == '月':
            print(a.get('href'))
            print(a.text)
结果如下所示。下面是获取天气数据并保存到txt文本的完整代码:
import requests
import bs4
def SaveData(path, url):
    """Fetch one month's weather table from *url* and append it to *path*.

    The page's weather-history section is a flat list of <li> cells, six
    per row: date, max temp, min temp, weather, wind direction, wind force.
    The first six cells are the table header and are skipped. Each row is
    written as one comma-separated line.
    """
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    cells = []
    for tag in soup.find_all('div', class_='tqtongji2'):
        for li in tag.select('li'):
            cells.append(li.text)
    rows = cells[6:]  # drop the 6 header cells
    # Append mode replaces the original open('r+') + read()-to-EOF trick
    # (which crashed if the file did not exist), and the context manager
    # guarantees the handle is closed even if a write fails.
    # NOTE(review): encoding is left at the platform default to match the
    # file created elsewhere with open(path, 'w') — consider utf-8 for both.
    with open(path, 'a') as outfile:
        for i, value in enumerate(rows):
            # Six cells per row: the sixth cell terminates the line.
            sep = '\n' if i % 6 == 5 else ','
            outfile.write(str(value) + sep)
response = requests.get('http://lishi.tianqi.com/')
soup = bs4.BeautifulSoup(response.text, "html.parser")

# A-Z index headers (and the stray 'h' entry) are anchors too; they are
# filtered out of the city-name list with one set-membership test instead
# of the original 27-clause comparison chain.
index_headers = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ') | {'h'}

arrName = []      # text of every <a> on the page
arrLink = []      # href of every <a> on the page
arrCityName = []  # city names
arrCityLink = []  # city page URLs

for a in soup.select('a'):
    arrLink.append(a.get('href'))
    arrName.append(a.text)

# NOTE(review): the bounds 42..456 (links) and 41..456 (names) are
# hard-coded from inspecting the page layout; any change to the site
# breaks them — confirm against the live page.
for link in arrLink[42:456]:
    if link != '#':
        arrCityLink.append(link)

# Diagnostics: print positions of the first index header and of the
# last city ('忠县') to help re-derive the slice bounds above.
for i, name in enumerate(arrName):
    if name == 'A':
        print(i)
for i, name in enumerate(arrName):
    if name == '忠县':
        print(i)

for name in arrName[41:456]:
    if name not in index_headers:
        arrCityName.append(name)

print(len(arrCityLink))
print(len(arrCityName))

for i in range(len(arrCityLink)):
    response = requests.get(arrCityLink[i])
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    # Create (or truncate) the per-city output file.
    path = 'E:/tempt/' + arrCityName[i] + '.txt'
    open(path, 'w').close()
    # Walk the month-index section and save each month's data.
    for tagthree in soup.find_all('div', class_='tqtongji1'):
        for a in tagthree.select('a'):
            text = a.text
            # Keep only 'YYYY年MM月...' style anchors; the length guard
            # prevents IndexError on shorter anchor texts.
            if len(text) >= 8 and text[4] == '年' and text[7] == '月':
                print(a.get('href'))
                print(a.text)
                SaveData(path, a.get('href'))
谢谢!祝各位天天开心。