首先在数据库中创建一张表(Spots),列名分别是 id(自增主键)、spot、url、ident。
创建mapping
#spoturl.py
from sqlalchemy import Column, String, INT, and_
from sqlalchemy.ext.declarative import declarative_base
# Declarative base class that the ORM entity class below inherits from.
Base = declarative_base()
# Entity class whose attributes correspond to the columns of the DB table.
class Spots(Base):
    """ORM entity for one row of the ``Spots`` table.

    Columns: ``id`` (integer primary key), ``spot`` (scenic spot name),
    ``url`` (link to the scenic spot) and ``ident`` (integer marker).
    """

    __tablename__ = 'Spots'

    id = Column(INT(), primary_key=True)
    spot = Column(String())  # scenic spot name
    url = Column(String())  # link to the scenic spot
    ident = Column(INT())  # marker flag

    def __init__(self, spot, url, ident):
        """Store the given spot name, link and marker on the new instance."""
        self.spot, self.url, self.ident = spot, url, ident

    def get_equal_filter(self):
        """Return an AND clause matching rows equal to this instance.

        A row matches when its ``spot``, ``url`` and ``ident`` columns all
        equal the values held by this object.
        """
        conditions = (
            Spots.spot == self.spot,
            Spots.url == self.url,
            Spots.ident == self.ident,
        )
        return and_(*conditions)
连接数据库
#database.py
import sqlalchemy, types
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
class DataBase:
    """Small wrapper around a SQLAlchemy session for add/query/update work.

    Objects handed to the lookup helpers must provide a
    ``get_equal_filter()`` method returning the filter clause that selects
    their matching row.
    """

    # Example connection string only -- fill in your own database settings.
    # NOTE(review): credentials should not be kept in source code; pass
    # ``db_url`` explicitly (e.g. loaded from configuration) in real use.
    DEFAULT_URL = 'mssql+pymssql://mstftDB:Qual1234@10.233.156.73/testmstdb'

    def __init__(self, db_url=None, echo=True):
        """Create the engine and open one session.

        Args:
            db_url: SQLAlchemy connection URL; falls back to ``DEFAULT_URL``.
            echo: Forwarded to ``create_engine`` (SQL statement logging).
        """
        self.engine = create_engine(db_url or self.DEFAULT_URL, echo=echo)
        DBSession = sessionmaker(bind=self.engine)
        self.session = DBSession()

    def _commit(self):
        """Commit the session, rolling it back before re-raising on failure."""
        try:
            self.session.commit()
        except Exception:
            # Without a rollback the session stays in a failed state and
            # every later operation on it would fail as well.
            self.session.rollback()
            raise

    def add_item(self, obj):
        """Add ``obj`` to the session and commit."""
        self.session.add(obj)
        self._commit()

    def query_all(self, target_class):
        """Return a list with every row of ``target_class``."""
        return self.session.query(target_class).all()

    def is_in_database(self, obj):
        """Return True when a row matching ``obj.get_equal_filter()`` exists.

        Raises:
            AttributeError: if ``obj`` has no ``get_equal_filter`` method.
        """
        # Fetching only the first match is enough to decide existence.
        return self.get_database_obj(obj) is not None

    def update_item(self, obj, update_hash):
        """Apply ``update_hash`` to the stored row matching ``obj`` and commit.

        When no matching row exists, ``obj`` itself receives the new values
        and is added to the session.

        Args:
            obj: Object providing ``get_equal_filter()``.
            update_hash: Mapping of attribute name to new value.

        Returns:
            The object that was updated and committed.

        Raises:
            AttributeError: if ``obj`` has no ``get_equal_filter`` method.
        """
        database_obj = self.get_database_obj(obj)
        if database_obj is None:
            database_obj = obj
        # setattr assigns the attribute directly; no source text is built or
        # executed, so keys can never be interpreted as code.
        for key, value in update_hash.items():
            setattr(database_obj, key, value)
        self.session.add(database_obj)
        self._commit()
        return database_obj

    def get_database_obj(self, obj):
        """Return the first row matching ``obj.get_equal_filter()``, or None.

        Raises:
            AttributeError: if ``obj`` has no ``get_equal_filter`` method.
        """
        if not hasattr(obj, "get_equal_filter"):
            raise AttributeError("Cannot find \"get_equal_filter\" in class")
        obj_filter = obj.get_equal_filter()
        return self.session.query(obj.__class__).filter(obj_filter).first()
主程序
#spot.py
import requests
import itchat
import time
from random import choice
from bs4 import BeautifulSoup
from selenium import webdriver
from database import DataBase
from spoturl import Spots
class Spot:
    """Collect scenic-spot links, store them, and send spot intros over WeChat."""

    # Location of the PhantomJS binary used to render JavaScript-driven pages.
    # NOTE(review): recent Selenium releases no longer ship a PhantomJS
    # driver; a headless Firefox/Chrome driver is the usual replacement.
    PHANTOMJS_PATH = 'C:/YKP/software/phantomjs-2.1.1-windows/bin/phantomjs.exe'

    # Remark names of the WeChat friends that receive each message; replace
    # the placeholder with the remark names of your own friends.
    FRIENDS = ['好友备注']

    def __init__(self):
        """Open the database wrapper used by all other methods."""
        self.db_obj = DataBase()

    def get_soup(self, url, retry_nums=2):
        """Render ``url`` in a headless browser and return the parsed page.

        The pages rely on JavaScript, so Selenium drives a headless browser
        instead of a plain HTTP download.

        Args:
            url: Page address to load.
            retry_nums: How many additional attempts to make after a failure.

        Returns:
            A ``BeautifulSoup`` tree, or ``None`` when every attempt failed.
        """
        # Imported here so the module-level import block stays untouched.
        from selenium.common.exceptions import WebDriverException

        for _ in range(max(retry_nums, 0) + 1):
            time.sleep(5)  # pause before every page load
            driver = None
            try:
                driver = webdriver.PhantomJS(executable_path=self.PHANTOMJS_PATH)
                driver.get(url)
                html = driver.page_source
            except (WebDriverException, requests.RequestException) as e:
                print('Download error' + str(e))
                continue
            finally:
                # Always release the browser process, even when loading failed.
                if driver is not None:
                    driver.quit()
            return BeautifulSoup(html, "html.parser")
        return None

    def get_city(self, url):
        """Return a ``{city name: city link}`` dict scraped from ``url``.

        Returns an empty dict when the page could not be loaded.
        """
        soup = self.get_soup(url)
        if soup is None:
            return {}
        links = soup.find_all("a", {"class": "link"})
        # The first matching link is skipped -- presumably it is not a city
        # entry; verify against the live page.
        return {link.text: link['href'] for link in links[1:]}

    def get_spot(self, url):
        """Store the popular scenic spots of every city listed at ``url``.

        Each spot found is handed to ``add_db`` with marker 0.
        """
        for city_name, city_url in self.get_city(url).items():
            print(city_name)
            time.sleep(1)
            soup = self.get_soup(city_url)
            spots_line = None
            if soup is not None:
                spots_line = soup.find("dl", {"class": "line clrfix"})
            if spots_line is None:
                print("烂地,一个景点都没有!")
                continue
            for spot_tag in spots_line.find_all('dd', {'class': 'tag show'}):
                spot_a = spot_tag.find('a')
                # Skip tags without a usable link instead of crashing.
                if spot_a is None or not spot_a.get('href'):
                    continue
                self.add_db(spot_a.text, spot_a['href'], 0)

    def add_db(self, spotname, spoturl, ident):
        """Insert the spot into the database unless an equal row exists."""
        user = Spots(spotname, spoturl, ident)
        if not self.db_obj.is_in_database(user):
            print('spotName:' + spotname + '\n' + 'spotUrl:' + spoturl)
            self.db_obj.add_item(user)

    def query_url(self):
        """Send the intro of one randomly chosen spot whose marker is still 0.

        Spots without any intro text are marked as handled (marker 1) and
        another spot is tried.  The loop stops after the first successful
        send, when a page cannot be loaded, or when nothing is left.
        """
        pending = [row for row in self.db_obj.query_all(Spots) if row.ident == 0]
        while pending:
            picked = choice(pending)
            pending.remove(picked)
            content = self._fetch_intro(picked.spot, picked.url)
            if content is None:
                # Loading failed; keep the spot unsent so a later run retries it.
                return
            if content:
                print(content)
                self.send_mas(content)
            # Marker 1 means the spot has been handled and is not picked again.
            self.db_obj.update_item(picked, {'ident': 1})
            if content:
                return
        print('没有可发送的景点')

    def _fetch_intro(self, name, url):
        """Return the intro text at ``url``.

        Returns ``''`` when the page has no intro and ``None`` when the page
        could not be loaded.
        """
        print(url)
        soup = self.get_soup(url)
        if soup is None:
            return None
        box = soup.find('div', {'class': 'e_db_content_box'})
        if box is None:
            print('无介绍')
            return ''
        content = ''.join(p.text + '\n' for p in box.find_all('p'))
        if content == '':
            print('无' + name + '景区详细介绍' + url)
        return content

    def get_content(self, name, url):
        """Fetch the intro of one spot and send it when it is not empty.

        Choosing a different spot when this one has no intro is the job of
        ``query_url``; this method only handles the spot it was given, which
        keeps the two methods from calling each other recursively.

        Returns:
            True when an intro was sent, False otherwise.
        """
        content = self._fetch_intro(name, url)
        if not content:
            return False
        print(content)
        self.send_mas(content)
        return True

    def send_mas(self, content):
        """Log in to WeChat through itchat and send ``content`` to each friend."""
        itchat.auto_login(hotReload=True)
        for friend in self.FRIENDS:
            matches = itchat.search_friends(friend)
            if not matches:
                print('Friend not found: ' + friend)
                continue
            itchat.send(content, matches[0]["UserName"])
if __name__ == '__main__':
    spot = Spot()

    # One-time setup: run the two lines below once so that every scenic-spot
    # link is stored in the DB, then comment them out again.
    # url = 'http://travel.qunar.com/place/'
    # spot.get_spot(url)

    # The sending times can be customised with the `schedule` package:
    # schedule.every().day.at("08:00").do(spot.query_url)
    # schedule.every().day.at("12:00").do(spot.query_url)
    # schedule.every().day.at("20:00").do(spot.query_url)
    # while True:
    #     schedule.run_pending()
    #     time.sleep(1)

    # Alternatively, let third-party scheduling software invoke this script
    # at fixed times; each run sends one scenic-spot intro.
    spot.query_url()