1. 主类Spider
import os
import random
import requests
import xlrd
from lxml import etree
from pymongo import MongoClient
from retrying import retry
from com.medchat.Settings import *
from com.medchat.util.dama import indetify
class OrganizationTypeSpider:
    """Spider that searches zgcx.nhfpc.gov.cn for a hospital's organization type.

    Every lookup loads the search page to obtain its anti-forgery token,
    downloads a captcha image, has it solved by ``indetify`` and then submits
    the search form. Solved captchas are archived on disk and in MongoDB.
    """

    def __init__(self):
        self.url = "http://zgcx.nhfpc.gov.cn:9090/unit/index"
        self.code_url = "http://zgcx.nhfpc.gov.cn:9090/CaptchaGenerate/Generate/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, "
                          "like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"}
        # A fresh requests.Session is created for, and closed after, each run().
        self.session = None
        self.client = MongoClient(host="192.168.1.27", port=27017)
        # Collection in which solved captchas (file id + recognised text) are archived.
        self.collection = self.client[MONGODB_CAPTCHA_CODE][MONGODB_TABLE_CAPTCHA_CODE]
        # Set by run() when the attempt must be reported as failed (and retried).
        self.open_exception = False

    @retry(stop_max_attempt_number=3)
    def run(self, hospital_name, province_name):
        """Search for the organization type of one hospital.

        :param hospital_name: hospital name; a name shorter than four
            characters after stripping is rejected without any request.
        :param province_name: province of the hospital; must be a key of
            ``PROVINCE_CODE_MAP`` (see Settings.py).
        :return: ``(cls, items)``. ``items`` is the list of all result rows
            (dicts with keys ``province``, ``examing_approving_org``,
            ``hospital`` and ``cls``). ``cls`` is the type of the row whose
            name equals ``hospital_name``, ``INVALID_HOSPITAL_NAME`` when no
            row matches exactly, or ``NO_HOSPITAL`` when the page shows that
            message. Too-short names yield ``(INVALID_HOSPITAL, "")``.
        :raises RuntimeError: when the captcha solver reports an error or the
            page shows ``ERROR_1`` / ``ERROR_2``. Like any other exception
            this triggers ``@retry`` (at most three attempts in total).
        """
        self.open_exception = False
        if len(hospital_name.strip()) < 4:
            return INVALID_HOSPITAL, ""

        cls = ""
        items = []
        self.session = requests.Session()
        try:
            # Use a random User-Agent for every attempt.
            self.headers["User-Agent"] = random.choice(USER_AGENT_LIST)
            response = self.session.get(self.url, headers=self.headers)
            html = etree.HTML(response.content.decode())
            token = html.xpath("//input[@name='__RequestVerificationToken']/@value")[0]

            # Fetch the captcha image and have it recognised.
            response = self.session.get(self.code_url, headers=self.headers)
            cid, check_code = indetify(response.content)
            if CAPTCHA_CODE_MAP.get(cid):
                # cid has an entry in CAPTCHA_CODE_MAP (presumably the solver's
                # error codes -- confirm in Settings.py): recognition failed.
                self.open_exception = True
            else:
                # Recognition succeeded: archive the image and its text so the
                # solved captchas can be reused later.
                size = str(len(os.listdir(CAPTCHA_PATH)))
                with open(os.path.join(CAPTCHA_PATH, size + ".jpg"), "wb") as f:
                    f.write(response.content)
                self.collection.insert_one({"id": size, "code": check_code})

                data = {'__RequestVerificationToken': token,
                        'Prov': PROVINCE_CODE_MAP[province_name],
                        'Check_Code': check_code,
                        'Unit_Name': hospital_name.replace(" ", "")}
                # Post through the session so the cookies tied to the token and
                # the captcha are sent along.
                response = self.session.post(self.url, headers=self.headers, data=data)
                html = etree.HTML(response.content.decode())

                text = html.xpath("/html/body/div/div[2]/form/div/div[1]/div[4]/div/div/ul/li/text()")
                print("text:", text)
                if ERROR_1 in text or ERROR_2 in text:
                    # The form came back with a validation error: fail this attempt.
                    self.open_exception = True

                for tr in html.xpath("//table[contains(@class,'table-bordered')]//tbody/tr"):
                    cells = tr.xpath("./td/text()")
                    item_dict = {'province': cells[0],
                                 'examing_approving_org': cells[1],
                                 'hospital': cells[2],
                                 'cls': cells[3]}
                    if hospital_name == item_dict['hospital']:
                        cls = item_dict['cls']
                    items.append(item_dict)
                if not cls:
                    cls = INVALID_HOSPITAL_NAME

                text1 = html.xpath("/html/body/div/div[2]/div[2]/h3/text()")
                print("text1", text1)
                if NO_HOSPITAL in text1:
                    cls = NO_HOSPITAL
        finally:
            self.session.close()
            self.session = None

        if self.open_exception:
            raise RuntimeError("captcha recognition failed or the search form was rejected")
        return cls, items
def import_excel():
    """Load the hospitals to look up from the Excel files in EXCEL_MEDCHAT_DATA.

    For every regular file in that directory the first sheet is read, the
    first row is skipped as a header, and columns 0 and 1 are taken as the
    province name and the hospital name. Cell values are converted to text
    and all spaces are removed, so callers can always treat them as strings.

    :return: list of dicts with the keys ``province_name`` and
        ``hospital_name``, in file and row order.
    """
    items = []
    for entry in os.listdir(EXCEL_MEDCHAT_DATA):
        filename = EXCEL_MEDCHAT_DATA + "/" + entry
        print(filename)
        if not os.path.isfile(filename):
            continue
        # xlrd opens workbooks read-only; only the first sheet is used.
        book = xlrd.open_workbook(filename)
        sheet = book.sheet_by_index(0)
        if sheet.ncols < 2:
            # Without both columns no usable record can be built.
            print("skip {}: fewer than 2 columns".format(filename))
            continue
        for row in range(1, sheet.nrows):
            items.append({
                'province_name': str(sheet.cell(row, 0).value).replace(" ", ""),
                'hospital_name': str(sheet.cell(row, 1).value).replace(" ", ""),
            })
    return items
if __name__ == '__main__':
    # Look up every hospital from the Excel input and store one result
    # document per hospital in MongoDB.
    spider = OrganizationTypeSpider()
    client = MongoClient(host="192.168.1.27", port=27017)
    collection = client[MONGODB_MEDCHAT_DB_1][MONGODB_MEDCHAT_TABLE_1]
    original_items = import_excel()
    total = len(original_items)
    print("即将要处理 {} 条医院数据,请耐心等待~".format(total))
    for count, original_item in enumerate(original_items, start=1):
        hospital_name = original_item['hospital_name'].strip()
        result_cls = ""
        result_items = []
        try:
            # The province is fixed for this batch; pass
            # original_item['province_name'].strip() to use each row's own.
            result_cls, result_items = spider.run(hospital_name, "广西壮族自治区")
        except Exception:
            # spider.run already retried; record the failure and move on.
            result_cls = THREE_TIMES
        # result_items is "" for rejected names, so the joins yield "".
        hospitals = "***".join(item['hospital'] for item in result_items)
        clazzs = "***".join(item['cls'] for item in result_items)
        result = {'province_name': original_item['province_name'],
                  'hospital_name': hospital_name,
                  'cls': result_cls, 'hospitals': hospitals, 'clss': clazzs, "items": result_items}
        print(result)
        collection.insert_one(result)
        print("%s 的机构级别:%s" % (hospital_name, result_cls))
        if total - count == 0:
            print("恭喜您,{}条医院数据已经处理完成!".format(total))
        else:
            print("还有 {} 条医院数据等待处理,请耐心等待~".format(total - count))
        print("*" * 100)
2. 打码用的类
# coding=utf-8
import http.client, mimetypes, urllib, json, time, requests
######################################################################
class YDMHttp:
    """Small HTTP client for the Yundama captcha-recognition API.

    Every API call posts the account credentials plus a ``method`` name to
    ``apiurl`` and receives a JSON object. Calls that fail return the API's
    negative ``ret`` code; ``-9001`` is returned when the reply is empty.
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _payload(self, method, **extra):
        """Build the form fields shared by every API call."""
        data = {'method': method, 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        data.update(extra)
        return data

    def _call(self, data, result_key, files=None):
        """Send *data* and return ``response[result_key]`` or an error code."""
        response = self.request(data, files)
        if not response:
            return -9001
        ret = response.get('ret')
        if ret and ret < 0:
            return ret
        return response[result_key]

    def request(self, fields, files=None):
        """Post *fields* (and optional *files*) and return the decoded JSON reply."""
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the account balance, or a negative error code."""
        return self._call(self._payload('balance'), 'balance')

    def login(self):
        """Return the account uid, or a negative error code."""
        return self._call(self._payload('login'), 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload a captcha image and return its cid, or a negative error code.

        :param filename: the image, either as raw bytes or as a ``str`` path
            to an image file (the file is read and its bytes are uploaded).
        :param codetype: captcha type id understood by the service.
        :param timeout: recognition timeout forwarded to the service.
        """
        if isinstance(filename, str):
            # A path was given: send the file's content, not the path text.
            with open(filename, 'rb') as image:
                filename = image.read()
        data = self._payload('upload', codetype=str(codetype), timeout=str(timeout))
        return self._call(data, 'cid', {'file': filename})

    def result(self, cid):
        """Return the recognised text for *cid*, or ``''`` if none is available."""
        response = self.request(self._payload('result', cid=str(cid)))
        if not response:
            return ''
        return response.get('text') or ''

    def decode(self, filename, codetype, timeout):
        """Upload an image and poll for its text.

        Polls once per second, at most *timeout* times.

        :return: ``(cid, text)`` on success, ``(-3003, '')`` when no text
            arrived in time, or ``(error_code, '')`` when the upload failed.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''
        for _ in range(timeout):
            result = self.result(cid)
            if result != '':
                return cid, result
            time.sleep(1)
        return -3003, ''

    def post_url(self, url, fields, files=None):
        """POST *fields* and *files* to *url* and return the response body text."""
        res = requests.post(url, files=files, data=fields)
        return res.text
######################################################################
# Account settings used by indetify() / indetify_by_filepath() below.
# NOTE(review): the credentials and the app key are hard-coded in source;
# consider loading them from configuration instead.
# Username
username = '******'
# Password
password = '******'
appId = 8854  # Software ID, required for developer revenue sharing. Get it from "My Software" in the developer console.
appKey = 'f3c0afb03c6035748679e859ce3a1285'  # Software key, required for developer revenue sharing. Get it from "My Software" in the developer console.
# Captcha type id passed to YDMHttp.decode
codetype = 3007
# Timeout passed to YDMHttp.decode, which also uses it as the number of one-second polls
timeout = 60
def indetify(response_content):
    """Recognise a captcha image through the Yundama service.

    :param response_content: raw bytes of the captcha image.
    :return: ``(cid, result)`` as returned by ``YDMHttp.decode``: the captcha
        id (negative on failure) and the recognised text (``''`` on failure).
    :raises ValueError: when the module-level account settings still hold the
        ``'username'`` placeholder. Callers unpack the return value, so
        returning nothing here would only fail later with a less clear error.
    """
    if username == 'username':
        raise ValueError('请设置好相关参数再测试')
    # Create the client.
    yundama = YDMHttp(username, password, appId, appKey)
    # Log in to Yundama.
    uid = yundama.login()
    print('uid: %s' % uid)
    # Query the remaining balance.
    balance = yundama.balance()
    print('balance: %s' % balance)
    # Recognise: image bytes, captcha type id, timeout in seconds.
    cid, result = yundama.decode(response_content, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
    return cid, result
def indetify_by_filepath(file_path):
    """Recognise the captcha image stored at *file_path*.

    The file is read here and its bytes are handed to ``YDMHttp.decode``,
    because the upload sends whatever it receives as the file content.

    :param file_path: path of the captcha image file.
    :return: the recognised text (``''`` on failure), or ``None`` when the
        module-level account settings still hold the ``'username'`` placeholder.
    """
    if username == 'username':
        print('请设置好相关参数再测试')
        return None
    with open(file_path, 'rb') as image:
        image_bytes = image.read()
    # Create the client.
    yundama = YDMHttp(username, password, appId, appKey)
    # Log in to Yundama.
    uid = yundama.login()
    print('uid: %s' % uid)
    # Query the remaining balance.
    balance = yundama.balance()
    print('balance: %s' % balance)
    # Recognise: image bytes, captcha type id, timeout in seconds.
    cid, result = yundama.decode(image_bytes, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
    return result
######################################################################
if __name__ == '__main__':
    # Manual check: download one captcha, keep a copy as test.png and send
    # the same bytes to the recognition service.
    sample_url = "http://qian.sicent.com/Login/code.do"
    captcha_response = requests.get(sample_url)
    image_bytes = captcha_response.content
    with open("test.png", "wb") as image_file:
        image_file.write(image_bytes)
    indetify(image_bytes)
3. 导出到excel的类
import json
import xlsxwriter
from pymongo import MongoClient
from com.medchat.Settings import *
def export_data_2_excel(filename, data_list):
    """Write the lookup results to an .xlsx file.

    Row 0 holds the column headers; each following row holds one record.
    The input records are not modified.

    :param filename: path of the workbook to create.
    :param data_list: iterable of dicts with the keys ``province_name`` and
        ``hospital_name``, and optionally ``cls``, ``hospitals``, ``clss``
        and ``items``. A missing or ``None`` optional value is written as an
        empty cell text.
    """
    cols = ['省份', '医院名称', '医院等级', '医院信息', '等级信息', '备注']
    # The context manager closes (and thereby saves) the workbook even if a
    # write raises part-way through.
    with xlsxwriter.Workbook(filename) as workbook:
        worksheet = workbook.add_worksheet()
        for col_index, col in enumerate(cols):
            worksheet.write(0, col_index, col)
        for row_index, data in enumerate(data_list, start=1):
            worksheet.write(row_index, 0, data['province_name'])
            worksheet.write(row_index, 1, data['hospital_name'])
            for col_index, key in enumerate(('cls', 'hospitals', 'clss'), start=2):
                value = data.get(key)
                worksheet.write(row_index, col_index, "" if value is None else value)
            items = data.get('items')
            # The remark column is the JSON dump of all rows with the double
            # quotes stripped.
            result = json.dumps("" if items is None else items, ensure_ascii=False)
            worksheet.write(row_index, 5, result.replace("\"", ""))
# Export the stored results of one province.
PROVINCE_NAME = "广西壮族自治区"
PROVINCE_NAME_1 = "广西"
if __name__ == '__main__':
    mongo = MongoClient(host="192.168.1.27", port=27017)
    results = mongo[MONGODB_MEDCHAT_DB_1][MONGODB_MEDCHAT_TABLE_1]
    rows = []
    # Collect every document of the province, echoing each one as it is read.
    for document in results.find({"province_name": PROVINCE_NAME}):
        rows.append(document)
        print(document)
    export_data_2_excel("{}.xlsx".format(PROVINCE_NAME), rows)
4. 处理数据的相关逻辑代码
from pymongo import MongoClient
import re
import xlsxwriter
from com.medchat.Settings import *
from com.medchat.spider.OrganizationTypeSpider import OrganizationTypeSpider
import copy
import json
import time
class DataTask:
def __init__(self):
    """Open the MongoDB collection, create the spider and run the grouping query.

    ``self.ret`` is the aggregation result: one document per distinct
    combination of the listed fields, sorted by province, city and county.
    """
    self.client = MongoClient(host="192.168.1.27", port=27017)
    self.collection = self.client[MONGODB_HOSPITAL_NAME][MONGODB_HOSPITAL_TABLE_NAME_5]
    self.data_list = []
    self.spider = OrganizationTypeSpider()
    # Test switch.
    self.flag = True
    field_names = ("province_name", "city_name", "county_name",
                   "hospital_name_pre", "hospital_name", "alias_name",
                   "organization_type", "note", "relation_organization")
    # Group on all fields to drop duplicates, then flatten the group key
    # back into top-level fields.
    group_key = {name: "$" + name for name in field_names}
    projection = {name: "$_id." + name for name in field_names}
    projection["_id"] = 0
    sort_order = {"province_name": 1, "city_name": 1, "county_name": 1}
    self.ret = self.collection.aggregate([
        {"$group": {"_id": group_key}},
        {"$project": projection},
        {"$sort": sort_order},
    ])
def deal_with_alias_name(self, item):
    """Extract a hospital alias from the bracketed part of its original name.

    Looks for a bracketed section in ``item['hospital_name_pre']`` (ASCII
    ``()`` or full-width ``（）`` brackets). When the section, brackets
    included, is longer than three characters, it is stored without any
    brackets in ``item['alias_name']``. Otherwise *item* is left unchanged;
    a missing or empty ``hospital_name_pre`` is also left unchanged.

    :param item: hospital record (dict); modified in place.
    """
    name = item.get('hospital_name_pre') or ""
    match = re.match(r".*(?P<name1>[(（].*[)）]).*", name)
    if match is None:
        return
    bracketed = match.group('name1')
    # Two brackets plus at least two characters of alias text.
    if len(bracketed) > 3:
        item['alias_name'] = bracketed.translate(str.maketrans("", "", "()（）"))
def deal_with_hospital_name_post(self, item):
"""处理医院名称"""
if '医院' not in item['hospital_name']:
item['hospital_name_post'] = copy.deepcopy(item['hospital_name'])
p_len = len(item['province_name'])
c_len = len(item['city_name'])
cc_len = len(item['county_name'])
"""
思路:1. 比较三者的长度(核心)
1.1. 三者长度相等(替换顺序无所谓), 但是去掉省市县后还得替换一次
1.2. 有两个长度相等(是一个的长度长,还是两个相等的长)
1.2.1. 两个的长
1.2.1.1. 先替换两个相同的, 再替换短的, 但是去掉省市县后还得替换一次
1.2.2. 一个的长
1.2.2.1. 先替换长的,再替换两个相同的
1.3. 长度各不相同
依次替换长到短
"""
if p_len == c_len == cc_len:
item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'], "").replace(
item['city_name'], "").replace(item['county_name'], "")
item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'][:-1], "").replace(
item['city_name'][:-1], "").replace(item['county_name'][:-1], "")
elif p_len == c_len and p_len < cc_len:
item['hospital_name_post'] = item['hospital_name_post'].replace(item['county_name'], "").replace(
item['city_name'], "").replace(item['province_name'], "")
item['hospital_name_post'] = item['hospital_name_post'].replace(item['county_name'][:-1], "").replace(
item['city_name'][:-1], "").replace(item['province_name'][:-1], "")
elif p_len == cc_len and p_len < c_len:
item['hospital_name_post'] = item['hospital_name_post'].replace(item['city_name'], "").replace(
item['county_name'], "").replace(item['province_name'], "")
item['hospital_name_post'] = item['hospital_name_post'].replace(item['city_name'][:-1], "").replace(
item['county_name'][:-1], "").replace(item['province_name'][:-1], "")
elif cc_len == c_len and c_len < p_len:
item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'], "").replace(
item['county_name'], "").replace(item['city_name'], "")
item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'][:-1], "").replace(
item['county_name'][:-1], "").replace(item['city_name'][:-1], "")
elif p_len > cc_len > c_len:
item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'], "").replace(
item['county_name'], "").replace(item['city_name'], "")
item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'][:-1], "").replace(
item['county_name'][:-1], "").replace(item['city_name'][:-1], "")
elif c_len > p_len > cc_len:
item['hospital_name_post'] = item['hospital_name_post'].replace(item['city_name'], "").replace(