1. The crawl needs proxy IPs; without them the site starts serving CAPTCHA pages.
2. There is currently no cap on the number of failed attempts; in practice a single-process run finished overnight, about 610,000 rows in total.
3. No cookies are set at all; when a request comes from a fresh IP, a CAPTCHA is basically never triggered.
4. At startup the program builds a pass_list by querying the existing table for towns that have already been crawled; those towns are skipped, which makes resuming after an interruption much more efficient.
5. The resume logic could also check at the province and city level, but then the last entry of the list must be dropped first, because the run may have failed partway through that city and left it incomplete.
6. The pages have encoding problems: gbk, utf-8 and gb2312 all failed on some of them, and only ISO-8859-1 worked in the end; see the short sketch right after this list.
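The workaround in point 6 amounts to recovering the raw response bytes and letting lxml sniff the real charset from the page's own meta tag. A minimal standalone sketch of the idea (no proxies here; the try order mirrors the full script below):

import requests
from lxml import etree

resp = requests.get('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html')
try:
    # requests often guesses the wrong encoding for these pages, so first try
    # round-tripping through the guessed encoding into a Chinese codec
    html = resp.text.encode(resp.encoding).decode('gb2312')
except Exception:
    # fall back to the raw bytes: re-encoding the mis-decoded text as
    # ISO-8859-1 is lossless, and etree.HTML() accepts bytes and picks up
    # the real charset from the page's meta tag itself
    html = resp.text.encode('ISO-8859-1')
tree = etree.HTML(html)

The full script follows.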
# -*- coding: utf-8 -*-
import time

import requests
from lxml import etree

from conf.dbr import random_ip
from conf.conn import Pymsql_conn

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
    # "Referer": "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Host": "www.stats.gov.cn",
}
# Parse the provinces and their URLs from the index page and return them as a list.
def parse_province(main_url):
    """
    Parse the province names and their URLs from the index page.
    :param main_url: index page URL
    :return: list of (province, province_url) tuples
    """
    while 1:
        response = None  # reset each attempt so a failed request isn't re-parsed
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=main_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e)
            if response:
                # province_html = response.text.encode(encoding=response.encoding).decode('gbk')
                province_html = response.text.encode(encoding=response.encoding).decode('gb2312')
                tree = etree.HTML(province_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="provincetr"]/td/a')
                    # print(result_list)
                    province_list = []
                    for i in result_list:
                        province = i.xpath('./text()')[0]
                        province_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + i.xpath('./@href')[0]
                        print(province, province_url)
                        province_list.append((province, province_url))
                    # print(province_list)
                    if province_list:
                        return province_list
                except Exception as e:
                    print(province_html)
                    print(e, 'province xpath parsing failed')
        else:
            print('no proxy available, sleeping 3s - province')
            time.sleep(3)
# Take a province and its URL, and return the cities listed on the province page.
def parse_city(province, province_url):
    """
    Parse the city rows on a province page.
    :param province: province name
    :param province_url: province page URL
    :return: list of city dicts
    """
    while 1:
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=province_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e)
            if response:
                try:
                    city_html = response.text.encode(encoding=response.encoding).decode('utf-8')
                except Exception as e:
                    print(e)
                    try:
                        city_html = response.text.encode(encoding=response.encoding).decode('gbk')
                    except Exception as e:
                        print(e)
                        # last resort: raw bytes, let lxml sniff the charset
                        city_html = response.text.encode(encoding='ISO-8859-1')
                tree = etree.HTML(city_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="citytr"]')
                    city_list = []
                    for i in result_list:
                        city_code = i.xpath('./td[1]/a/text()')[0]
                        city = i.xpath('./td[2]/a/text()')[0]
                        city_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + i.xpath('./td[2]/a/@href')[0]
                        print(province, city_code, city, city_url)
                        city_list.append({
                            "province": province,
                            "city_code": city_code,
                            "city": city,
                            "city_url": city_url,
                        })
                    # print(city_list)
                    if city_list:
                        return city_list
                except Exception as e:
                    print(city_html)
                    print(e, 'city xpath parsing failed')
        else:
            print('no proxy available, sleeping 3s - city')
            time.sleep(3)
# Parse the districts/counties on a city page.
def parse_district(city_dict):
    """
    Parse the district/county rows from the city page in city_dict.
    :param city_dict: dict produced by parse_city
    :return: list of district dicts
    """
    province = city_dict.get('province')
    city_code = city_dict.get('city_code')
    city = city_dict.get('city')
    city_url = city_dict.get('city_url')
    # headers['Cookie'] = ''
    while 1:
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=city_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e, 'city page request failed')
            if response:
                try:
                    city_html = response.text.encode(encoding=response.encoding).decode('gbk')
                except Exception as e:
                    print(e)
                    try:
                        city_html = response.text.encode(encoding=response.encoding).decode('utf-8')
                    except Exception as e:
                        print(e)
                        city_html = response.text.encode(encoding='ISO-8859-1')
                tree = etree.HTML(city_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="countytr"]')
                    district_list = []
                    for i in result_list:
                        district_code = i.xpath('./td[1]/a/text()')
                        # rows whose first cell has no link (e.g. municipal districts) are skipped
                        if district_code:
                            district_code = district_code[0]
                            district = i.xpath('./td[2]/a/text()')[0]
                            href = i.xpath('./td[2]/a/@href')[0]
                            num = href.split('/')[-1][0:2]
                            district_url = f'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/{num}/' + href
                            print(province, city_code, city, district_code, district, district_url)
                            district_list.append({
                                "province": province,
                                "city_code": city_code,
                                "city": city,
                                "district_code": district_code,
                                "district": district,
                                "district_url": district_url,
                            })
                    # print(district_list)
                    return district_list
                except Exception as e:
                    print(city_html)
                    print(e, 'district parsing failed')
        else:
            print('no proxy available, sleeping 3s - district')
            time.sleep(3)
# Parse the towns on a district page.
def parse_town(district_dict):
    """
    Parse the town rows from the district page in district_dict.
    :param district_dict: dict produced by parse_district
    :return: list of town dicts
    """
    province = district_dict.get('province')
    city_code = district_dict.get('city_code')
    city = district_dict.get('city')
    district_code = district_dict.get('district_code')
    district = district_dict.get('district')
    district_url = district_dict.get('district_url')
    # headers['Cookie'] = ''
    while 1:
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=district_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e, 'town page request failed')
            if response:
                try:
                    district_html = response.text.encode(encoding=response.encoding).decode('gbk')
                except Exception as e:
                    print(e)
                    try:
                        district_html = response.text.encode(encoding=response.encoding).decode('utf-8')
                    except Exception as e:
                        print(e)
                        district_html = response.text.encode(encoding='ISO-8859-1')
                tree = etree.HTML(district_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="towntr"]')
                    town_list = []
                    for i in result_list:
                        town_code = i.xpath('./td[1]/a/text()')[0]
                        town = i.xpath('./td[2]/a/text()')[0]
                        href = i.xpath('./td[2]/a/@href')[0]
                        num1 = href.split('/')[-1][0:2]
                        num2 = href.split('/')[-1][2:4]
                        town_url = f'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/{num1}/{num2}/' + href
                        print(town_code, town, town_url)
                        town_list.append({
                            "province": province,
                            "city_code": city_code,
                            "city": city,
                            "district_code": district_code,
                            "district": district,
                            "town_code": town_code,
                            "town": town,
                            "town_url": town_url,
                        })
                    # print(town_list)
                    return town_list
                except Exception as e:
                    print(district_html)
                    print(e, 'town parsing failed')
        else:
            print('no proxy available, sleeping 3s - town')
            time.sleep(3)
# Parse the village/neighborhood committees on a town page.
def parse_village(town_dict):
    """
    Parse the village/neighborhood committee rows from the town page in town_dict.
    :param town_dict: dict produced by parse_town
    :return: list of village dicts
    """
    province = town_dict.get('province')
    city_code = town_dict.get('city_code')
    city = town_dict.get('city')
    district_code = town_dict.get('district_code')
    district = town_dict.get('district')
    town_code = town_dict.get('town_code')
    town = town_dict.get('town')
    town_url = town_dict.get('town_url')
    # headers['Cookie'] = ''
    while 1:
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=town_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e, 'village page request failed')
            # print(response.text)
            if response:
                try:
                    village_html = response.text.encode(encoding=response.encoding).decode('gbk')
                except Exception as e:
                    print(e)
                    try:
                        village_html = response.text.encode(encoding=response.encoding).decode('utf-8')
                    except Exception as e:
                        print(e)
                        # village_html = response.text.encode(encoding=response.encoding).decode('ISO-8859-1').encode('ISO-8859-1')
                        village_html = response.text.encode(encoding='ISO-8859-1')
                tree = etree.HTML(village_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="villagetr"]')
                    village_list = []
                    for i in result_list:
                        village_code = i.xpath('./td[1]/text()')[0]
                        town_country_code = i.xpath('./td[2]/text()')[0]
                        village = i.xpath('./td[3]/text()')[0]
                        print(village_code, town_country_code, village)
                        village_list.append({
                            "province": province,
                            "city_code": city_code,
                            "city": city,
                            "district_code": district_code,
                            "district": district,
                            "town_code": town_code,
                            "town": town,
                            "village_code": village_code,
                            "village": village,
                            "town_country_code": town_country_code,
                        })
                    # print(village_list)
                    return village_list
                except Exception as e:
                    print(village_html)
                    print(e, 'village parsing failed')
        else:
            print('no proxy available, sleeping 3s - village')
            time.sleep(3)
# Write the scraped rows into MySQL.
def sql_in(task_list):
    pm = Pymsql_conn()
    for task in task_list:
        province = task.get('province')
        city_code = task.get('city_code')
        city = task.get('city')
        district_code = task.get('district_code')
        district = task.get('district')
        town_code = task.get('town_code')
        town = task.get('town')
        village_code = task.get('village_code')
        village = task.get('village')
        town_country_code = task.get('town_country_code')
        val = (province, city_code, city, district_code, district, town_code, town, village_code, village, town_country_code)
        # print(val)
        insert_sql = f'insert into national_statistics_code(province,city_code,city,district_code,district,town_code,town,village_code,village,town_country_code) values {val}'
        try:
            pm.insert_info(sql=insert_sql)
        except Exception as e:
            print(e, insert_sql)
    pm.commit_info()
    pm.mysql_close()
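# Note: interpolating the tuple's repr into the INSERT statement only works
# while no value contains a quote character; a parameterized query (e.g.
# pymysql's cursor.execute(sql, val) with %s placeholders) would be safer,
# assuming Pymsql_conn exposes the underlying cursor -- that wrapper's API
# isn't shown here.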
# Program entry point.
def server_run():
    main_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
    province_list = parse_province(main_url)
    for i in province_list:
        province, province_url = i[0], i[1]
        if province not in ['北京市', '天津市']:  # skip provinces that have already been crawled
            city_list = parse_city(province, province_url)
            print(province, 'province')
            for city in city_list:
                print(city.get('city'), 'city')
                if city.get('city') != '石家庄市':  # likewise, skip a city that is already done
                    district_list = parse_district(city)
                    for district in district_list:
                        town_list = parse_town(district)
                        for town in town_list:
                            print(province, city.get('city'), town.get('town'))
                            if town.get('town') not in pass_list:
                                vil_list = parse_village(town)
                                sql_in(vil_list)  # write to MySQL


if __name__ == '__main__':
    pms = Pymsql_conn()
    pass_list = pms.check_info(sql='SELECT distinct(town) from national_statistics_code;')
    # towns already in the table are skipped on resume; per note 5, a
    # province/city-level resume should drop the last entry, since that
    # one may only be partially crawled
    pass_list = [i.get('town') for i in pass_list]
    print(pass_list)
    pms.mysql_close()
    server_run()
# main_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
# parse_province(main_url)
# town_dict = {
# "province":"河北省",
# "city_code":"130229000000",
# "city":"唐山市",
# "district_code":"130229201000",
# "district":"玉田县",
# "town_code":"130229203000",
# "town":"潮洛窝乡",
# "town_url":"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/13/02/29/130229203.html",
# }
# parse_village(town_dict)
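Each parse_* function above repeats the same fetch-through-a-random-proxy loop with no limit on failures (point 2 of the notes). A sketch of how the shared pattern could be factored out with a retry cap, reusing the script's random_ip and headers; fetch_html and max_retries are illustrative names, not part of the original script:

def fetch_html(url, max_retries=20):
    """Fetch a URL through random proxies, giving up after max_retries attempts."""
    for attempt in range(max_retries):
        ip = random_ip()
        if not ip:
            print('no proxy available, sleeping 3s')
            time.sleep(3)
            continue
        try:
            response = requests.get(url=url, headers=headers,
                                    proxies={"http": ip, "https": ip}, timeout=1)
            if response:  # requests treats non-2xx responses as falsy
                return response
        except Exception as e:
            print(e, f'attempt {attempt + 1}/{max_retries} failed')
    return None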