任务:写一个爬虫程序,访问“广西空气质量实时发布系统”网页,实时获取南宁市各个站点的PM2.5的值(程序要在后台运行,实时获取),并把获取到的值存放到Python内置的数据库(SQLite)里面。本人没学过python,于是花了两天的时间,绞尽脑汁才写出来,这也是我人生中第一个python程序。
首先要做准备工作:去我的资源库里找到“python爬虫并在后台运行所需要的材料”并下载,把里面的三个文件夹复制到python安装目录的lib文件夹里面(注意是lib不是libs),如下图:
然后再运行里面的pywin32-220.win-amd64-py3.5.exe,把pywin32安装到python的安装目录下,它主要是用来让程序在后台运行的;顺便把SQLite也安装上,方便测试。
好,准备工作完成,直接上代码:
这个类主要是用于让程序在后台运行的:
class PmService(win32serviceutil.ServiceFramework):
    """Windows service that periodically stores Nanning PM2.5 readings.

    Each cycle downloads the station data with ``get_content``, extracts the
    timestamp / station-name / PM2.5 lists with ``get_data`` and writes one
    row per station through ``add_db``.
    """

    # Service name.
    _svc_name_ = "PmService"
    # Service display name.
    _svc_display_name_ = "PmServiceDemo"
    # Service description.
    _svc_description_ = "Python service demo."

    # Endpoint polled on every cycle; areaName is the URL-encoded "南宁市".
    _data_url = 'http://www.gxepb.gov.cn/AQI/Ashx/AQIDataService.ashx?action=GetAllDatas&areaName=%E5%8D%97%E5%AE%81%E5%B8%82'
    # Pause between two polls, in milliseconds.
    _poll_interval_ms = 1000

    def __init__(self, args):
        """Create the stop event and the file logger for the service."""
        win32serviceutil.ServiceFramework.__init__(self, args)
        self.hWaitStop = win32event.CreateEvent(None, 0, 0, None)
        self.logger = self._getLogger()
        self.isAlive = True

    def _getLogger(self):
        """Return the '[PmService]' logger writing to service.log next to this file."""
        import logging
        import os
        import inspect

        logger = logging.getLogger('[PmService]')
        # logging.getLogger returns the same object for the same name, so only
        # attach the file handler once; otherwise every record is duplicated.
        if not logger.handlers:
            this_file = inspect.getfile(inspect.currentframe())
            dirpath = os.path.abspath(os.path.dirname(this_file))
            handler = logging.FileHandler(os.path.join(dirpath, "service.log"))
            formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        logger.setLevel(logging.INFO)
        return logger

    def SvcDoRun(self):
        """Poll the data source until the service is asked to stop."""
        self.logger.info("svc do run....")
        print('正在后台执行。。。。。。')
        while self.isAlive:
            self.logger.info("I am alive.")
            # Wait on the stop event instead of sleeping so that a stop request
            # arriving during the pause is honoured immediately.
            rc = win32event.WaitForSingleObject(self.hWaitStop, self._poll_interval_ms)
            if rc == win32event.WAIT_OBJECT_0:
                # The event was created as auto-reset, so the signal has been
                # consumed here; leave the loop rather than waiting on it again.
                break
            try:
                self._poll_once()
            except Exception:
                # Service boundary: record the failure and keep polling rather
                # than letting one bad response terminate the service.
                self.logger.exception("poll failed")

    def _poll_once(self):
        """Fetch the current readings and store one row per station."""
        html = get_content(self._data_url)
        times, names, values = get_data(html)
        if not times or not names:
            self.logger.warning("no station data found in response")
            return
        # The second timestamp is used for every row when it exists; fall back
        # to the first one instead of raising IndexError.
        publish_time = times[1] if len(times) > 1 else times[0]

        # Split the payload into per-station segments. The split yields one
        # more piece than there are stations, so the trailing piece is dropped.
        segments = html.split('}')[:-1]
        null_count = 0  # stations seen so far whose PM2.5 value is null
        for index, (segment, name) in enumerate(zip(segments, names)):
            if re.search('"PM25_1H":null', segment):
                # A null reading is stored as "--".
                add_db(index + 1, name, "--", publish_time)
                null_count += 1
                continue
            # ``values`` only holds the non-null readings, so its index lags
            # behind the station index by the number of nulls seen so far.
            value_index = index - null_count
            if value_index >= len(values):
                self.logger.warning("PM2.5 values and stations are misaligned")
                break
            add_db(index + 1, name, values[value_index], publish_time)

    def SvcStop(self):
        """Handle a stop request from the Service Control Manager."""
        self.logger.info("svc do stop....")
        # Tell the SCM that the service is stopping.
        self.ReportServiceStatus(win32service.SERVICE_STOP_PENDING)
        # Clear the flag first so the run loop sees it when the event wakes it.
        self.isAlive = False
        win32event.SetEvent(self.hWaitStop)
def get_content(url , data = None):
    """Download *url* with browser-like headers and return the body as text.

    The request is retried without limit: after a failure the function prints
    the error, sleeps for a random number of seconds and tries again. The
    response is always decoded as UTF-8.

    Args:
        url: Address to fetch.
        data: Unused; kept so existing callers remain valid.

    Returns:
        The response body as a string.
    """
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # Only advertise encodings that requests can decode; the previous
        # value also listed "sdch", which it cannot.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    }
    timeout = random.choice(range(80, 180))
    while True:
        try:
            rep = requests.get(url, headers=header, timeout=timeout)
        except socket.timeout as e:
            print( '3:', e)
            time.sleep(random.choice(range(8,15)))
        except socket.error as e:
            # In Python 3 socket.error is an alias of OSError, so this branch
            # also catches the exceptions raised by requests itself.
            print( '4:', e)
            time.sleep(random.choice(range(20, 60)))
        except http.client.BadStatusLine as e:
            print( '5:', e)
            time.sleep(random.choice(range(30, 80)))
        except http.client.IncompleteRead as e:
            print( '6:', e)
            time.sleep(random.choice(range(5, 15)))
        else:
            rep.encoding = 'utf-8'
            return rep.text
def get_data(html_text):
    """Extract timestamps, station names and PM2.5 values from the response text.

    Only quoted values are captured, so a station whose ``PM25_1H`` is
    ``null`` contributes no entry to the PM2.5 list.

    Args:
        html_text: Raw response body as a string.

    Returns:
        A three-element list ``[timestamps, station_names, pm25_values]``,
        each element being a list of strings in document order.
    """
    pm_pattern = re.compile(r'(?<=\"PM25_1H\":)\"(.+?)\"')
    name_pattern = re.compile(r'(?<=\"DisplayName\":)\"(.+?)\"')
    time_pattern = re.compile(r'(?<=\"AQIDATEFORMAT\":)\"(.+?)\"')
    return [
        time_pattern.findall(html_text),  # publication times
        name_pattern.findall(html_text),  # station names
        pm_pattern.findall(html_text),    # PM2.5 values
    ]
def add_db(pid,station,pmtow,sendtime,db_path="E:/mywork.db"):
    """Insert or overwrite one station reading in the ``nanning_pm`` table.

    The table is created on first use. A row whose ``id`` already exists is
    replaced, so each station keeps a single, most recent row.

    Args:
        pid: Row id in the table.
        station: Station name.
        pmtow: PM2.5 value.
        sendtime: Publication time of the reading.
        db_path: SQLite database file; defaults to the original fixed path.

    Returns:
        An empty list, as before (no statement here produces result rows).
    """
    cx = sqlite3.connect(db_path)
    try:
        cx.execute("create table if not exists nanning_pm (id integer primary key,city text NULL,displayname text NULL,pm text NULL,time text NULL)")
        # Bound parameters keep quotes in station names from breaking or
        # altering the statement.
        cx.execute("insert or replace into nanning_pm values (?,?,?,?,?)", (pid,'南宁',station,pmtow,sendtime))
        cx.commit()
    finally:
        # Always release the connection, even when a statement fails.
        cx.close()
    return []
# Script entry point: hand the command line to pywin32 together with the
# service class.
if __name__ == '__main__':
    win32serviceutil.HandleCommandLine(PmService)
安装服务
python PmService.py install
让服务自动启动
python PmService.py --startup auto install
启动服务
python PmService.py start
重启服务
python PmService.py restart
停止服务
python PmService.py stop
删除/卸载服务
python PmService.py remove