import importlib
import sys
import pymysql
# NOTE(review): this looks like a leftover of the Python 2 idiom
# `reload(sys); sys.setdefaultencoding(...)`. Nothing visible here depends on
# `sys` being reloaded, and the importlib documentation advises against
# reloading `sys` -- confirm and consider removing this line.
importlib.reload(sys)
import requests
import lxml.etree as etree
import os
class chinese_city():
def __init__(self):
self.baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
self.conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="123456", db="test", charset='utf8')
self.cur = self.conn.cursor()
self.trdic = {
1: '//tr[@class="provincetr"]',
2: '//tr[@class="citytr"]',
3: '//tr[@class="countytr"]',
4: '//tr[@class="towntr"]',
5: '//tr[@class="villagetr"]'
}
def __del__(self):
if self.cur:
self.cur.close()
if self.conn:
self.conn.close()
def crawl_page(self, url):
''' 爬行政区划代码公布页 '''
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
i = 0
while i < 3:
try:
html = requests.get(url, headers=headers, timeout=20)
html.encoding = 'gbk'
text = html.text
return text
except requests.exceptions.RequestException<