# -*- coding: utf-8 -*-
"""
Created on Fri Apr 27 15:12:18 2018
#python 3.6
"""
import datetime
import logging

from pymongo import MongoClient
from requests_html import HTMLSession
def createDB(host='localhost', port=27017):
    """Connect to MongoDB and return the three per-site collections.

    Opens a client against ``host``/``port``, selects the ``job`` database
    and hands back its ``ustc``, ``hfut`` and ``ahu`` collections, which hold
    the data scraped from the three sites respectively.

    Args:
        host: MongoDB server host name. Defaults to ``'localhost'``.
        port: MongoDB server port. Defaults to ``27017``.

    Returns:
        A tuple ``(collection_ustc, collection_hfut, collection_ahu)``.
    """
    client = MongoClient(host, port)
    db = client.job
    return db.ustc, db.hfut, db.ahu
def store(collection, _id, Theme, HoldDate, VenuesName, Description):
    """Write one record into the given collection, best effort.

    The record is keyed by ``_id``: an existing document with the same
    ``_id`` is replaced, otherwise a new one is inserted.

    Args:
        collection: Target collection object exposing ``replace_one``.
        _id: Value stored under the ``"_id"`` key and used as the match key.
        Theme: Value stored under ``"Theme"``.
        HoldDate: Value stored under ``"HoldDate"``.
        VenuesName: Value stored under ``"VenuesName"``.
        Description: Value stored under ``"Description"``.

    Returns:
        None. Failures are logged and not propagated, so one bad record
        does not abort the caller.
    """
    data = {
        "_id": _id,
        "Theme": Theme,
        "HoldDate": HoldDate,
        "VenuesName": VenuesName,
        "Description": Description,
    }
    try:
        # Collection.save() was deprecated in PyMongo 3.0 and removed in 4.0;
        # an upserting replace_one keyed on _id is its documented equivalent.
        collection.replace_one({"_id": _id}, data, upsert=True)
    except Exception:
        # Stay best-effort like the original, but leave a trace instead of
        # silently discarding the error.
        logging.getLogger(__name__).exception(
            "Failed to store record with _id=%r", _id
        )
def isFuture(HoldDate, isFuture=False):
    """Tell whether the event has not taken place yet.

    Args:
        HoldDate: Event time; its ``.date()`` is compared with today's date
            as given by ``datetime.datetime.now()``.
        isFuture: Value returned when the event is not in the future.
            Defaults to ``False``.

    Returns:
        ``True`` when ``HoldDate`` falls on a later calendar day than today;
        otherwise the ``isFuture`` argument unchanged. An event dated today
        is therefore not counted as future.
    """
    today = datetime.datetime.now().date()
    if HoldDate.date() > today:
        return True
    return isFuture
def get_USTC_Info(col_ustc

本文介绍如何使用Python的requests_html库爬取合工大、安大、中科大的就业信息网宣讲会数据,并结合datetime模块判断宣讲会是否已举办,已举办的不存入MongoDB数据库。通过HTMLSession请求页面后,使用返回响应的html.find()方法,按标签和class精准提取信息,重点讲解了如何定位和解析HTML中class属性为'panel-body'的div标签内容。
最低0.47元/天 解锁文章
297

被折叠的 条评论
为什么被折叠?



