19. 卫健委官网医院查询爬虫+验证码识别(云打码)综合案例

1. 主类Spider

import os
import random

import requests
import xlrd
from lxml import etree
from pymongo import MongoClient
from retrying import retry

from com.medchat.Settings import *
from com.medchat.util.dama import indetify


class OrganizationTypeSpider:
    """Spider that looks up a hospital's organisation type on the NHFPC query site.

    Each call to :meth:`run` opens a fresh HTTP session, fetches the search
    page to obtain the anti-forgery token, downloads and recognises the
    captcha, then submits the search form and parses the result table.
    """

    def __init__(self):
        self.url = "http://zgcx.nhfpc.gov.cn:9090/unit/index"
        self.code_url = "http://zgcx.nhfpc.gov.cn:9090/CaptchaGenerate/Generate/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, "
                          "like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"}
        self.session = None
        self.client = MongoClient(host="192.168.1.27", port=27017)
        # Collection that stores every captcha image id together with its recognised text.
        self.collection = self.client[MONGODB_CAPTCHA_CODE][MONGODB_TABLE_CAPTCHA_CODE]
        # Set to True by run() when the attempt must be treated as failed.
        self.open_exception = False

    @retry(stop_max_attempt_number=3)
    def run(self, hospital_name, province_name):
        """Search the organisation type of one hospital.

        :param hospital_name: hospital name to search for
        :param province_name: province name; must be a key of
            ``PROVINCE_CODE_MAP`` (see Settings.py)
        :return: ``(cls, items)`` where ``cls`` is the matched organisation
            type (or one of the ``INVALID_HOSPITAL_NAME`` / ``NO_HOSPITAL``
            markers) and ``items`` is the list of parsed result rows.  Names
            shorter than four characters return ``(INVALID_HOSPITAL, "")``
            without touching the network.
        :raises RuntimeError: when the captcha could not be recognised or the
            site answered with one of the known error messages; the ``retry``
            decorator re-runs the method up to three times.
        """
        self.open_exception = False
        # Bound up front so the final return can never hit an unbound local.
        cls = ""
        items = []

        if len(hospital_name.strip()) < 4:
            return INVALID_HOSPITAL, ""

        self.session = requests.Session()
        try:
            self.headers["User-Agent"] = random.choice(USER_AGENT_LIST)

            response = self.session.get(self.url, headers=self.headers)
            html = etree.HTML(response.content.decode())
            token = html.xpath("//input[@name='__RequestVerificationToken']/@value")[0]

            # Download the captcha image and have it recognised.
            response = self.session.get(self.code_url, headers=self.headers)
            cid, check_code = indetify(response.content)

            # NOTE(review): CAPTCHA_CODE_MAP presumably maps failure ids to
            # messages, so a miss means recognition succeeded - confirm in Settings.py.
            if not CAPTCHA_CODE_MAP.get(cid):
                # Keep the recognised captcha so a local sample set can be built.
                size = str(len(os.listdir(CAPTCHA_PATH)))
                with open(CAPTCHA_PATH + "/" + size + ".jpg", "wb") as f:
                    f.write(response.content)
                # insert_one replaces Collection.insert, which PyMongo 4 removed.
                self.collection.insert_one({"id": size, "code": check_code})

                data = {'__RequestVerificationToken': token,
                        'Prov': PROVINCE_CODE_MAP[province_name],
                        'Check_Code': check_code,
                        'Unit_Name': hospital_name.replace(" ", "")}

                # Post through the session so its cookies and connection are reused.
                response = self.session.post(self.url, headers=self.headers, data=data)
                html = etree.HTML(response.content.decode())

                text = html.xpath("/html/body/div/div[2]/form/div/div[1]/div[4]/div/div/ul/li/text()")
                print("text:", text)
                if ERROR_1 in text or ERROR_2 in text:
                    self.open_exception = True

                trs = html.xpath("//table[contains(@class,'table-bordered')]//tbody/tr")
                for tr in trs:
                    cells = tr.xpath("./td/text()")
                    if len(cells) < 4:
                        # Not a data row (too few text cells); nothing to record.
                        continue
                    item_dict = {'province': cells[0],
                                 'examing_approving_org': cells[1],
                                 'hospital': cells[2], 'cls': cells[3]}
                    if hospital_name == item_dict['hospital']:
                        cls = item_dict['cls']

                    items.append(item_dict)

                if not cls:
                    cls = INVALID_HOSPITAL_NAME

                text1 = html.xpath("/html/body/div/div[2]/div[2]/h3/text()")
                print("text1", text1)
                if NO_HOSPITAL in text1:
                    cls = NO_HOSPITAL
            else:
                # Captcha recognition failed.
                self.open_exception = True
        finally:
            self.session.close()
            self.session = None

        if self.open_exception:
            # Raise explicitly (instead of dividing by zero) so @retry re-runs the search.
            raise RuntimeError("captcha recognition failed or the site reported an error")

        return cls, items


def import_excel():
    """Load province/hospital pairs from every Excel file in EXCEL_MEDCHAT_DATA.

    Only the first sheet of each workbook is read and its first row is
    skipped.  Column 0 becomes ``province_name`` and column 1 becomes
    ``hospital_name``, both with spaces removed.  A cell that cannot be
    cleaned has its error printed and keeps its raw value.

    :return: list of dicts, one per data row
    """
    column_keys = ('province_name', 'hospital_name')
    records = []
    for entry in os.listdir(EXCEL_MEDCHAT_DATA):
        path = EXCEL_MEDCHAT_DATA + "/" + entry
        print(path)
        if not os.path.isfile(path):
            continue

        # xlrd opens the workbook read-only; the first sheet holds the data.
        sheet = xlrd.open_workbook(path).sheet_by_index(0)
        for row_idx in range(1, sheet.nrows):
            record = {}
            # Only the first two columns carry data we use.
            for col_idx, key in enumerate(column_keys[:sheet.ncols]):
                try:
                    record[key] = sheet.cell(row_idx, col_idx).value
                    record[key] = record[key].replace(" ", "")
                except Exception as e:
                    print(e)
            records.append(record)

    return records


if __name__ == '__main__':
    # Batch driver: look up every hospital from the Excel input and store
    # one result document per hospital in MongoDB.
    spider = OrganizationTypeSpider()
    client = MongoClient(host="192.168.1.27", port=27017)
    collection = client[MONGODB_MEDCHAT_DB_1][MONGODB_MEDCHAT_TABLE_1]
    original_items = import_excel()
    print("即将要处理 {} 条医院数据,请耐心等待~".format(len(original_items)))
    count = 0

    for original_item in original_items:
        hospital_name = original_item['hospital_name'].strip()
        result_items = []
        try:
            # The province is fixed for this run; the per-row province from
            # the Excel sheet is only stored alongside the result.
            result_cls, result_items = spider.run(hospital_name, "广西壮族自治区")
        except Exception:
            # spider.run already retried three times before giving up.
            result_cls = THREE_TIMES

        # Written after the try block rather than in `finally`, so an
        # interrupt (Ctrl-C) does not store a bogus empty record.
        hospitals = "***".join(item['hospital'] for item in result_items)
        clazzs = "***".join(item['cls'] for item in result_items)

        result = {'province_name': original_item['province_name'],
                  'hospital_name': hospital_name,
                  'cls': result_cls, 'hospitals': hospitals, 'clss': clazzs, "items": result_items}
        print(result)
        # insert_one replaces Collection.insert, which PyMongo 4 removed.
        collection.insert_one(result)
        count += 1
        print("%s 的机构级别:%s" % (hospital_name, result_cls))
        if len(original_items) - count == 0:
            print("恭喜您,{}条医院数据已经处理完成!".format(len(original_items)))
        else:
            print("还有 {} 条医院数据等待处理,请耐心等待~".format(len(original_items) - count))

        print("*" * 100)

2. 打码用的类

# coding=utf-8
import http.client, mimetypes, urllib, json, time, requests



######################################################################

class YDMHttp:
    """Small HTTP client for the Yundama captcha-recognition API.

    Every call posts the account credentials plus a ``method`` name to
    ``apiurl`` and decodes the JSON reply.  Methods that return a single
    field give back the negative ``ret`` code on failure and ``-9001`` when
    the reply is empty.
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _payload(self, method, **extra):
        """Build the form fields for *method*: credentials first, then *extra*."""
        data = {'method': method, 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        data.update(extra)
        return data

    def _fetch_field(self, data, key, files=None):
        """Send *data* and return ``response[key]``, or an error code on failure."""
        response = self.request(data, files)
        if not response:
            return -9001
        ret = response.get('ret')
        if ret and ret < 0:
            return ret
        return response[key]

    def request(self, fields, files=None):
        """Post *fields* (and optional *files*) and return the decoded JSON reply."""
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the account balance, or a negative error code."""
        return self._fetch_field(self._payload('balance'), 'balance')

    def login(self):
        """Return the user id, or a negative error code."""
        return self._fetch_field(self._payload('login'), 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload a captcha and return its id (``cid``), or a negative error code.

        ``filename`` is sent as-is as the ``file`` part of the multipart
        request, so it has to be the image content, not a path.
        """
        data = self._payload('upload', codetype=str(codetype), timeout=str(timeout))
        return self._fetch_field(data, 'cid', {'file': filename})

    def result(self, cid):
        """Return the recognised text for *cid*, or ``''`` if none is available."""
        response = self.request(self._payload('result', cid=str(cid)))
        if not response:
            return ''
        # A reply without a "text" field counts as "no result yet".
        return response.get('text') or ''

    def decode(self, filename, codetype, timeout):
        """Upload a captcha and poll once per second for the recognised text.

        :return: ``(cid, text)`` on success, ``(-3003, '')`` when no text
            arrived within *timeout* polls, or ``(error_code, '')`` when the
            upload itself failed.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''
        for _ in range(0, timeout):
            text = self.result(cid)
            if text != '':
                return cid, text
            time.sleep(1)
        return -3003, ''

    def post_url(self, url, fields, files=None):
        """Post the form to *url* and return the response body as text."""
        # `files` values are passed to requests unchanged (no file is opened here).
        res = requests.post(url, files=files or None, data=fields)
        return res.text


######################################################################

# Yundama account user name (placeholder - fill in before running).
username = '******'

# Yundama account password (placeholder - fill in before running).
password = '******'

# NOTE(review): the software id/key below are committed in source; consider
# loading them from configuration or the environment instead.
appId = 8854   # Software ID, required for developer revenue sharing; found under "My software" in the developer console.
appKey = 'f3c0afb03c6035748679e859ce3a1285'     # Software key, required for developer revenue sharing; found under "My software" in the developer console.

# Captcha type id, sent as the "codetype" field of the upload request.
codetype = 3007

# Timeout: sent with the upload request and also used by YDMHttp.decode as
# the number of one-second polling attempts.
timeout = 60


def indetify(response_content):
    """Recognise a captcha from its raw image content.

    Logs in, prints the user id and the account balance, then uploads
    ``response_content`` and waits for the recognised text.

    :param response_content: captcha image content to upload
    :return: ``(cid, result)`` from ``YDMHttp.decode``; ``None`` when the
        module still carries the literal ``'username'`` placeholder.
    """
    if username == 'username':
        print('请设置好相关参数再测试')
        return None

    client = YDMHttp(username, password, appId, appKey)

    # Log in to Yundama.
    print('uid: %s' % client.login())

    # Query the remaining balance.
    print('balance: %s' % client.balance())

    # Upload the image and wait for the recognised text.
    captcha_id, text = client.decode(response_content, codetype, timeout)
    print('cid: %s, result: %s' % (captcha_id, text))

    return captcha_id, text


def indetify_by_filepath(file_path):
    """Recognise the captcha stored in the image file at *file_path*.

    The file is read here and its bytes are uploaded.  ``YDMHttp.post_url``
    sends the ``file`` value unchanged, so handing it the path itself would
    upload the path text instead of the image.

    :param file_path: path of the captcha image on disk
    :return: the recognised text (``''`` on failure); ``None`` when the
        module still carries the literal ``'username'`` placeholder.
    """
    if (username == 'username'):
        print('请设置好相关参数再测试')
        return None

    with open(file_path, 'rb') as image_file:
        image_bytes = image_file.read()

    # Initialise the client.
    yundama = YDMHttp(username, password, appId, appKey)

    # Log in to Yundama.
    uid = yundama.login()
    print('uid: %s' % uid)

    # Query the remaining balance.
    balance = yundama.balance()
    print('balance: %s' % balance)

    # Upload the image content and wait for the recognised text.
    cid, result = yundama.decode(image_bytes, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
    return result


######################################################################
if __name__ == '__main__':
    # Manual smoke test: download one captcha, keep a copy on disk and
    # send it through the recogniser.
    captcha_bytes = requests.get("http://qian.sicent.com/Login/code.do").content
    with open("test.png", "wb") as image_file:
        image_file.write(captcha_bytes)
    indetify(captcha_bytes)


3. 导出到excel的类

import json

import xlsxwriter
from pymongo import MongoClient
from com.medchat.Settings import *


def export_data_2_excel(filename, data_list):
    """Write hospital lookup results to an .xlsx workbook.

    Row 0 holds the column titles; each record in ``data_list`` fills one
    following row.  ``None`` values in the ``cls``, ``hospitals``, ``clss``
    and ``items`` fields are replaced with ``""`` in the record itself
    before being written.

    :param filename: path of the workbook to create
    :param data_list: iterable of result dicts
    """
    workbook = xlsxwriter.Workbook(filename)
    worksheet = workbook.add_worksheet()

    titles = ['省份', '医院名称', '医院等级', '医院信息', '等级信息', '备注']
    for position, title in enumerate(titles):
        worksheet.write(0, position, title)

    nullable_keys = ('cls', 'hospitals', 'clss')
    row_index = 0
    for data in data_list:
        row_index += 1
        worksheet.write(row_index, 0, data['province_name'])
        worksheet.write(row_index, 1, data['hospital_name'])

        # Columns 2-4: plain text fields, with None normalised to "".
        for col_index, key in enumerate(nullable_keys, start=2):
            if data[key] is None:
                data[key] = ""
            worksheet.write(row_index, col_index, data[key])

        # Column 5: the raw result rows as JSON text with the quotes stripped.
        if data['items'] is None:
            data['items'] = ""
        remark = json.dumps(data['items'], ensure_ascii=False).replace("\"", "")
        worksheet.write(row_index, 5, remark)

    workbook.close()


# Province whose records are exported (full name and short name).
PROVINCE_NAME = "广西壮族自治区"
PROVINCE_NAME_1 = "广西"

if __name__ == '__main__':
    # Export every stored result for PROVINCE_NAME into "<province>.xlsx".
    client = MongoClient(host="192.168.1.27", port=27017)
    collection = client[MONGODB_MEDCHAT_DB_1][MONGODB_MEDCHAT_TABLE_1]

    items = []
    for item in collection.find({"province_name": PROVINCE_NAME}):
        items.append(item)
        print(item)

    export_data_2_excel("{}.xlsx".format(PROVINCE_NAME), items)

4. 处理数据的相关逻辑代码

from pymongo import MongoClient
import re
import xlsxwriter

from com.medchat.Settings import *
from com.medchat.spider.OrganizationTypeSpider import OrganizationTypeSpider
import copy
import json
import time


class DataTask:
    def __init__(self):
        """Connect to MongoDB and prepare the de-duplicated hospital cursor.

        The aggregation groups documents on the nine hospital fields, so
        identical rows collapse into one.  It then flattens the group key
        back into top-level fields and sorts by province, city and county.
        """
        self.client = MongoClient(host="192.168.1.27", port=27017)
        self.collection = self.client[MONGODB_HOSPITAL_NAME][MONGODB_HOSPITAL_TABLE_NAME_5]
        self.data_list = []
        self.spider = OrganizationTypeSpider()
        # Test switch.
        self.flag = True

        grouped_fields = (
            "province_name", "city_name", "county_name",
            "hospital_name_pre", "hospital_name", "alias_name",
            "organization_type", "note", "relation_organization",
        )
        group_stage = {"$group": {"_id": {name: "$" + name for name in grouped_fields}}}

        # Lift each grouped field out of _id and drop _id itself.
        projection = {name: "$_id." + name for name in grouped_fields}
        projection["_id"] = 0
        project_stage = {"$project": projection}

        sort_stage = {"$sort": {"province_name": 1, "city_name": 1, "county_name": 1}}

        self.ret = self.collection.aggregate([group_stage, project_stage, sort_stage])

    def deal_with_alias_name(self, item):
        """Extract a hospital alias from a parenthesised part of the name.

        Looks at ``item['hospital_name_pre']``; because the leading ``.*`` is
        greedy, the captured span starts at the last opening parenthesis and
        runs to the last closing one.  Both ASCII and full-width parentheses
        (U+FF08 / U+FF09) are recognised.  When the span, parentheses
        included, is longer than three characters, it is stored in
        ``item['alias_name']`` with every parenthesis character removed.

        :param item: hospital record; modified in place
        """
        parens = "()\uff08\uff09"
        ret = re.match(r".*(?P<name1>[(\uff08].*[)\uff09]).*", item['hospital_name_pre'])
        # The length includes the two parentheses, so the alias needs at least two characters.
        if ret is not None and len(ret.group('name1')) > 3:
            item['alias_name'] = ret.group('name1').translate({ord(ch): None for ch in parens})

    def deal_with_hospital_name_post(self, item):
        """处理医院名称"""
        if '医院' not in item['hospital_name']:
            item['hospital_name_post'] = copy.deepcopy(item['hospital_name'])
            p_len = len(item['province_name'])
            c_len = len(item['city_name'])
            cc_len = len(item['county_name'])
            """
            思路:1. 比较三者的长度(核心)
                    1.1. 三者长度相等(替换顺序无所谓), 但是去掉省市县后还得替换一次
                    1.2. 有两个长度相等(是一个的长度长,还是两个相等的长)
                        1.2.1. 两个的长
                            1.2.1.1. 先替换两个相同的, 再替换短的, 但是去掉省市县后还得替换一次
                        1.2.2. 一个的长
                            1.2.2.1. 先替换长的,再替换两个相同的
                    1.3. 长度各不相同
                        依次替换长到短
            """
            if p_len == c_len == cc_len:
                item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'], "").replace(
                    item['city_name'], "").replace(item['county_name'], "")
                item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'][:-1], "").replace(
                    item['city_name'][:-1], "").replace(item['county_name'][:-1], "")
            elif p_len == c_len and p_len < cc_len:
                item['hospital_name_post'] = item['hospital_name_post'].replace(item['county_name'], "").replace(
                    item['city_name'], "").replace(item['province_name'], "")
                item['hospital_name_post'] = item['hospital_name_post'].replace(item['county_name'][:-1], "").replace(
                    item['city_name'][:-1], "").replace(item['province_name'][:-1], "")
            elif p_len == cc_len and p_len < c_len:
                item['hospital_name_post'] = item['hospital_name_post'].replace(item['city_name'], "").replace(
                    item['county_name'], "").replace(item['province_name'], "")
                item['hospital_name_post'] = item['hospital_name_post'].replace(item['city_name'][:-1], "").replace(
                    item['county_name'][:-1], "").replace(item['province_name'][:-1], "")
            elif cc_len == c_len and c_len < p_len:
                item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'], "").replace(
                    item['county_name'], "").replace(item['city_name'], "")
                item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'][:-1], "").replace(
                    item['county_name'][:-1], "").replace(item['city_name'][:-1], "")
            elif p_len > cc_len > c_len:
                item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'], "").replace(
                    item['county_name'], "").replace(item['city_name'], "")
                item['hospital_name_post'] = item['hospital_name_post'].replace(item['province_name'][:-1], "").replace(
                    item['county_name'][:-1], "").replace(item['city_name'][:-1], "")
            elif c_len > p_len > cc_len:
                item['hospital_name_post'] = item['hospital_name_post'].replace(item['city_name'], "").replace(
                   
评论 10
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值