'''
Created on 2019年12月23日
@author: Zhangzhiwei
'''
import datetime
import threading
import cx_Oracle
import urllib.request
import uuid
import pyamf
from pyamf import remoting
from pyamf.flex import messaging
import os
# Oracle client language/territory/charset setting, applied process-wide.
# Presumably required so Chinese text survives the round trip through
# cx_Oracle — TODO confirm against the database character set.
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
# Browser-like User-Agent header set.
# NOTE(review): no function visible in this file reads `headers`
# (getResponse builds its own header dict) — confirm whether it is still needed.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
class HqPara:
    """Top-level query object placed first in the AMF request body.

    The instance simply stores the four sub-objects it is given. The
    attribute names (including the ``provice`` spelling) are kept exactly as
    they are because the class is registered with PyAMF under a remote alias
    and presumably has to mirror the server-side field names — verify against
    the service before renaming anything.

    Args:
        breedInfo: product selector object (a ``BreedInfoPo`` in this file).
        provice: province selector object (a ``ProvincePo`` in this file).
        breedInfoDl: breed category selector (a ``BreedInfo`` in this file).
        marketInfo: market selector object (a ``PMarketInfo`` in this file).
    """

    def __init__(self, breedInfo, provice, breedInfoDl, marketInfo):
        # Assignment order is preserved from the original on purpose: it
        # fixes the order of the instance __dict__.
        self.marketInfo = marketInfo
        self.breedInfoDl = breedInfoDl
        self.breedInfo = breedInfo
        self.provice = provice
class BreedInfoPo:
    """Product selector carried inside ``HqPara``.

    Args:
        children0: value stored in the ``_children`` attribute.
        item_code: item code string, e.g. ``'000000'``.
        item_name: item name string (the caller in this file passes the
            "all" entry).
        children1: value stored in the ``children`` attribute.
    """

    def __init__(self, children0, item_code, item_name, children1):
        # Always sent as None; the constructor offers no way to set it.
        self.parentcode = None
        self._children = children0
        self.itemcode = item_code
        self.itemname = item_name
        self.children = children1
class ProvincePo:
    """Province selector carried inside ``HqPara``.

    Args:
        item_code: province code string, e.g. ``'000000'``.
        item_name: province name string (the original note gives the
            nationwide entry as the example).
    """

    def __init__(self, item_code, item_name):
        # Both child attributes are always None for this type.
        self._children = None
        self.itemcode = item_code
        self.itemname = item_name
        self.children = None
class BreedInfo:
    """Breed category selector carried inside ``HqPara`` as ``breedInfoDl``.

    Args:
        item_code: category code string (the original note gives ``'AM'``
            as an example).
        item_name: category name string (the original note gives the
            aquatic-products category as an example).
    """

    def __init__(self, item_code, item_name):
        # parentcode and both child attributes are always None for this type.
        self.parentcode = None
        self._children = None
        self.itemcode = item_code
        self.itemname = item_name
        self.children = None
class PMarketInfo:
    """Market selector carried inside ``HqPara``.

    Args:
        market_code: market code, stored as ``marketCode``.
        market_name: market name, stored as ``marketName``.
            NOTE(review): the original inline note on this field listed
            ``000000/AM`` as example values, which look like codes rather
            than names — confirm what the service expects.
    """

    def __init__(self, market_code, market_name):
        self.marketCode = market_code
        self.marketName = market_name
# ActionScript counterpart, for reference: registerClassAlias("personTypeAlias", Person);
# Register the custom body-parameter types with PyAMF so that the remote type
# name (e.g. com.itown.kas.pfsc.report.po.HqPara) is sent to the server along
# with the data; otherwise the server may reject the request with
# Client.Message.Deserialize.InvalidType.
# NOTE(review): BreedInfoPo and BreedInfo are both registered under the
# BreedInfoPo alias, exactly as in the original — confirm this is intended.
_ALIAS_PACKAGE = 'com.itown.kas.pfsc.report.po.'
for _klass, _remote_name in (
    (HqPara, 'HqPara'),
    (BreedInfoPo, 'BreedInfoPo'),
    (ProvincePo, 'ProvincePo'),
    (BreedInfo, 'BreedInfoPo'),
    (PMarketInfo, 'PMarketInfo'),
):
    pyamf.register_class(_klass, alias=_ALIAS_PACKAGE + _remote_name)
def construct_request(product_code, product_name, provice_code, provice_name, breed_code, breed_name, market_code, market_name, page_num, total_num):
    """Build the AMF3-encoded payload for one ``getHqSearchData`` call.

    Args:
        product_code, product_name: wrapped in a ``BreedInfoPo``.
        provice_code, provice_name: wrapped in a ``ProvincePo``.
        breed_code, breed_name: wrapped in a ``BreedInfo``.
        market_code, market_name: wrapped in a ``PMarketInfo``.
        page_num: page number; sent as a string (second body element).
        total_num: page size; sent as a string (third body element). Per the
            original author's note the site shows 15 rows per page by
            default, but a scraper can pass the total count to get
            everything in a single page.

    Returns:
        bytes: the encoded remoting envelope, ready to POST.
    """
    # First body element: the query object assembled from the four selectors.
    breedInfo = BreedInfoPo(None, product_code, product_name, None)
    provice = ProvincePo(provice_code, provice_name)
    breedInfoDl = BreedInfo(breed_code, breed_name)
    marketInfo = PMarketInfo(market_code, market_name)
    query = HqPara(breedInfo, provice, breedInfoDl, marketInfo)

    # Build the flex.messaging.messages.RemotingMessage.
    msg = messaging.RemotingMessage(
        messageId=str(uuid.uuid1()).upper(),
        clientId=str(uuid.uuid1()).upper(),
        operation='getHqSearchData',
        destination='reportStatService',
        timeToLive=0,
        timestamp=0,
    )
    msg.body = [query, str(page_num), str(total_num)]
    # Progress output kept from the original script.
    print(str(page_num))
    print(str(total_num))
    msg.headers['DSEndpoint'] = None
    msg.headers['DSId'] = str(uuid.uuid1()).upper()

    # Wrap the message in a remoting envelope and encode it as AMF3.
    req = remoting.Request('null', body=(msg,))
    env = remoting.Envelope(amfVersion=pyamf.AMF3)
    env.bodies = [('/1', req)]
    return bytes(remoting.encode(env).read())
# 返回一个请求的数据格式
def getResponse(data):
    """POST an AMF payload to the message broker and return the raw reply.

    Args:
        data: request body bytes (as produced by ``construct_request``).

    Returns:
        bytes: the unparsed response body.

    Raises:
        urllib.error.URLError: propagated unchanged when the request fails.
    """
    url = 'http://jgsb.agri.cn/messagebroker/amf'
    req = urllib.request.Request(url, data, headers={'Content-Type': 'application/x-amf'})
    opener = urllib.request.build_opener(urllib.request.HTTPHandler())
    # The response is closed as soon as it has been read; the original left
    # it open until garbage collection.
    with opener.open(req) as resp:
        return resp.read()
def getContent(response):
    """Decode an AMF response and extract the total count and the records.

    Args:
        response: raw response bytes (as returned by ``getResponse``).

    Returns:
        tuple: ``(total_num, info)`` where ``total_num`` is element 3 of the
        first body's payload (the total number of records, per the original
        author's note) and ``info`` is element 0 (the records iterated by
        the caller).
    """
    amf_parse_info = remoting.decode(response)
    payload = amf_parse_info.bodies[0][1].body.body
    total_num = payload[3]
    info = payload[0]
    # Debug output kept from the original script.
    print(info)
    return total_num, info
def _fetch_all_records():
    """Download every price record for the hard-coded market query."""
    query = ('000000', '全部', '370000', '山东省', '000000', '全部', '3707056', '寿光物流园')
    # Probe request (page 1, 2 rows): only used to learn the total count.
    total_num, _ = getContent(getResponse(construct_request(*query, 1, 2)))
    # Second request: page 1 with page size = total, i.e. everything at once.
    _, info = getContent(getResponse(construct_request(*query, 1, total_num)))
    return info


def _store_records(conn, records):
    """Insert the records into PRODUCTPRICE, committing roughly every 100 rows."""
    unit = '元/公斤'
    # Bind variables instead of string concatenation: scraped text containing
    # a quote can no longer break (or inject into) the statement.
    insert_sql = ("insert into PRODUCTPRICE(ID,NAME,PRICE,UNIT,AREA,INDATE) "
                  "values (:1, :2, :3, :4, :5, :6)")
    cursor_oracle = conn.cursor()
    try:
        pending = 0
        for record in records:
            print(record)
            # Price and date are still passed as strings, as in the original,
            # so the database performs the same implicit conversions.
            row = [
                str(uuid.uuid1()),
                record["farmProduceName"],
                str(record["averagePrice"]),
                unit,
                record["marketName"],
                str(record["reportDate"]),
            ]
            cursor_oracle.execute(insert_sql, row)
            pending += 1
            if pending > 100:
                conn.commit()
                pending = 0
        conn.commit()
    finally:
        cursor_oracle.close()


def func():
    """Scrape the price records, store them in Oracle, and re-arm the daily timer.

    Takes no arguments so it can be used directly as a ``threading.Timer``
    callback. Exceptions from the network or the database still propagate,
    but the connection is always closed and the next run (86400 seconds
    later) is always scheduled, so one failed run no longer stops the
    schedule.
    """
    try:
        conn = cx_Oracle.connect('username/password@127.0.0.1:8082/ORCL')
        print("数据库连接上了")
        try:
            _store_records(conn, _fetch_all_records())
        finally:
            conn.close()
    finally:
        # Re-arm for the same time tomorrow regardless of the outcome.
        timer = threading.Timer(86400, func)
        timer.start()
# Current local time.
now_time = datetime.datetime.now()
# Same time tomorrow, split into its calendar parts.
next_time = now_time + datetime.timedelta(days=1)
next_year, next_month, next_day = next_time.year, next_time.month, next_time.day
# 03:00:00 tomorrow.
next_time = datetime.datetime(next_year, next_month, next_day, 3, 0, 0)
# Seconds from now until 03:00 tomorrow.
timer_start_time = (next_time - now_time).total_seconds()
print(timer_start_time)
# Timer arguments: (delay in seconds, callable to run).
# NOTE(review): the delay is a fixed 10 seconds; timer_start_time above is
# only printed, never used — confirm which delay is intended.
timer = threading.Timer(10, func)
timer.start()
# Python爬虫脚本,利用Beautifulfly爬取动态网页(源码)
# 最新推荐文章于 2024-07-28 17:04:36 发布