import gzip
import os
import random
import re
import time
from concurrent.futures.thread import ThreadPoolExecutor
from datetime import datetime, timedelta
from faker import Faker
# Shared Faker instance configured with the zh_CN locale.
faker = Faker('zh_CN')
org_name_dict = {} # maps organisation number -> organisation name (filled by get_org_name)
# Organisation names already handed out; get_org_name uses it to skip duplicates.
org_name_temp_dict = {}
provice_code_dict = {} # province/city code dictionary used by get_pro_code
# provice_code = {
# "北京市": "01",
# "上海市": "02",
# "天津市": "03",
# "重庆市": "04",
# "黑龙江省": "05",
# "吉林省": "06",
# "辽宁省": "07",
# "内蒙古自治区": "08",
# "河北省": "09",
# "新疆维吾尔自治区": "10",
# "甘肃省": "11",
# "青海省": "12",
# "陕西省": "13",
# "宁夏回族自治区": "14",
# "河南省": "15",
# "山东省": "16",
# "山西省": "17",
# "安徽省": "18",
# "湖北省": "19",
# "湖南省": "20",
# "江苏省": "21",
# "四川省": "22",
# "贵州省": "23",
# "云南省": "24",
# "广西壮族自治区": "25",
# "西藏自治区": "26",
# "浙江省": "27",
# "江西省": "28",
# "广东省": "29",
# "福建省": "30",
# "台湾省": "31",
# "海南省": "32",
# "香港特别行政区": "33",
# "澳门特别行政区": "34"
# }
def dir_exists(dir_path):
    """
    Make sure the directory exists, creating it (and any parents) if needed.

    ``exist_ok=True`` is used instead of a separate existence check so that
    two callers creating the same directory at the same time cannot fail on
    the gap between "check" and "create".

    :param dir_path: directory path to ensure
    :return: the same ``dir_path`` that was passed in
    """
    os.makedirs(dir_path, exist_ok=True)
    return dir_path
def gz_file(file_path):
    """
    Compress a UTF-8 text file into a sibling ``.gz`` file.

    The output path is the input path with its final extension replaced by
    ``.gz``. ``os.path.splitext`` only looks at the last path component, so a
    dot in a directory name no longer truncates the output path, and a file
    without any extension simply gets ``.gz`` appended.

    :param file_path: path of the text file to compress
    :return: None
    """
    gz_path = os.path.splitext(file_path)[0] + ".gz"
    # Open the source first so a missing input does not leave an empty archive behind.
    with open(file_path, "r", encoding="utf-8") as f_r, gzip.open(gz_path, 'wb') as f_w:
        for line in f_r:
            f_w.write(line.encode('utf-8'))
def get_cust_no(i):
    """
    Build a customer number.

    The number is the fixed prefix ``16101`` followed by ``i`` left-padded
    with zeros to 11 characters (16 characters in total). A value longer
    than 11 characters is appended unpadded.

    :param i: sequence number
    :return: customer number string
    """
    prefix = '16101'
    serial = str(i).rjust(16 - len(prefix), "0")
    return prefix + serial
def get_mng_org_no(i, Area_code):
    """
    Build a managing-organisation number.

    The result is ``Area_code`` followed by ``i`` zero-padded to 18 digits,
    i.e. 20 characters for a two-character area code.

    :param i: sequence number
    :param Area_code: area-code prefix (string)
    :return: organisation number string
    """
    serial = str(i).zfill(18)
    return "".join((Area_code, serial))
def get_term_no(Area_code):
    """
    Build a marketing-team number.

    The result is ``Area_code`` followed by four characters: two zeros and
    two distinct random digits.

    :param Area_code: area-code prefix (string)
    :return: team number string
    """
    digits = random.sample(list(range(10)), 2)
    suffix = "".join(map(str, digits)).zfill(4)
    return Area_code + suffix
def get_term_org():
    """
    Build a marketing-team organisation number.

    The result is ``44`` followed by a random value from 101 to 200
    (inclusive) zero-padded to 8 digits, 10 characters in total.

    :return: organisation number string
    """
    candidates = list(range(101, 201))
    picked = random.sample(candidates, 1)[0]
    return "44" + "{:08d}".format(picked)
def get_id_card(i, Area_code):
    """
    Build the managing account-manager number (an ID-card style number).

    The result is ``Area_code`` + ``0332`` + ``i`` zero-padded to 12 digits.

    :param i: sequence number
    :param Area_code: area-code prefix (string)
    :return: ID number string
    """
    serial = str(i).zfill(12)
    return "{}0332{}".format(Area_code, serial)
def get_asign_time():
    """
    Produce a random assignment time.

    :return: time of day formatted as ``HH:MM:SS``
    """
    hour = random.randint(0, 23)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)
    return "{:02d}:{:02d}:{:02d}".format(hour, minute, second)
def get_id_card2(mng_id_card, i, count):
    """
    Build the assigner's staff number (an ID number that differs from the
    account manager's own number).

    A sequence number other than ``i`` is drawn from ``count`` and written
    over the zero-padded tail of ``mng_id_card``, so the result has the same
    length and prefix as the manager's number. The value that passed the
    "different from ``i``" check is the one actually used.

    :param mng_id_card: the manager's ID number, ending in ``i`` zero-padded
    :param i: the manager's sequence number (excluded from the draw)
    :param count: ``(low, high)`` inclusive range of sequence numbers
    :return: an ID number string different from ``mng_id_card``
    """
    low, high = count[0], count[1]
    if low == high == i:
        # The range holds nothing but the manager; step just outside it
        # instead of retrying forever.
        picked = i + 1
    else:
        picked = random.randint(low, high)
        while picked == i:
            picked = random.randint(low, high)
    # Replace only as many trailing characters as the widest number involved
    # needs, keeping the total length of the ID unchanged.
    width = max(len(str(high)), len(str(i)), len(str(picked)))
    return mng_id_card[:-width] + str(picked).zfill(width)
def get_pro_code(org_name):
    """
    Return the two-digit province/city code for an organisation name.

    The leading province (and optional city) is extracted from ``org_name``
    and used as the key into ``provice_code_dict``. A region seen for the
    first time gets a random code from ``01`` to ``99`` which is stored under
    that same region key, so later calls for the same region return the same
    code. Codes are not guaranteed to be unique across regions.

    :param org_name: organisation name starting with a province/city
    :return: two-character code string
    """
    region_pattern = re.compile(
        r'([\u4e00-\u9fa5]{2,5}?(?:省|自治区|行政区|市))([\u4e00-\u9fa5]{1,5}?(?:市)){0,1}')
    matched = region_pattern.search(org_name)
    if matched is None:
        # No recognisable region: fall back to the whole name as the key.
        region = org_name
    else:
        region = "".join(part for part in matched.groups() if part is not None)
    code = provice_code_dict.get(region)
    if code:
        return code
    candidate = str(random.randint(1, 99)).zfill(2)
    # setdefault keeps the first stored code if another thread got there first.
    return provice_code_dict.setdefault(region, candidate)
def get_city():
    """
    Generate a mock city name.

    Keeps asking Faker for a city until the value contains at least one
    character followed by ``市``.

    :return: city name string
    """
    city_pattern = re.compile(r'(.+?)市')
    candidate = faker.city()
    while city_pattern.search(candidate) is None:
        candidate = faker.city()
    return candidate
def get_country():
    """
    Generate a mock county name.

    Keeps asking Faker for a city until the value contains at least one
    character followed by ``县``.

    :return: county name string
    """
    county_pattern = re.compile(r'(.+?)县')
    candidate = faker.city()
    while county_pattern.search(candidate) is None:
        candidate = faker.city()
    return candidate
def get_org_name(org_no):
    """
    Generate a unique organisation (branch) name and register it.

    A province, optional city, county and street are drawn from Faker, the
    region part is normalised with a regular expression, and the name is
    rebuilt as ``<region><street>支行``. Names already present in
    ``org_name_temp_dict`` are rejected and a new one is drawn. The accepted
    name is recorded in ``org_name_temp_dict`` and in ``org_name_dict`` under
    ``org_no``.

    :param org_no: organisation number to register the name under
    :return: the generated organisation name
    """
    region_pattern = re.compile(
        r'([\u4e00-\u9fa5]{2,5}?(?:省|自治区|行政区|市))([\u4e00-\u9fa5]{1,7}?(?:市)){0,1}([\u4e00-\u9fa5]{1,7}?(?:区|县)){0,1}')
    while True:
        province = faker.province()
        street = faker.street_name()
        if "市" in province or "行政区" in province:
            # Municipalities and special regions have no separate city level.
            parts = [province, get_country()]
        else:
            parts = [province, get_city(), get_country()]
        raw_name = "".join(parts) + street + "支行"
        matched = region_pattern.search(raw_name)
        region = "".join(piece for piece in matched.groups() if piece is not None)
        candidate = region + street + "支行"
        if org_name_temp_dict.get(candidate) == 1:
            continue  # already used, draw another name
        org_name_temp_dict[candidate] = 1
        org_name_dict[org_no] = candidate
        return candidate
def get_parnt_org_name(org_name):
    """
    Derive the parent organisation name.

    Takes the leading province (and city, when present) of ``org_name`` and
    appends ``支行``.

    :param org_name: organisation name starting with a province/city
    :return: parent organisation name
    """
    region_pattern = re.compile(
        r'([\u4e00-\u9fa5]{2,5}?(?:省|自治区|行政区|市))([\u4e00-\u9fa5]{1,5}?(?:市)){0,1}')
    matched = region_pattern.search(org_name)
    region = "".join(piece for piece in matched.groups() if piece is not None)
    return region + "支行"
def get_org_no_name(i, Area_code, timeout=None):
    """
    Look up the organisation number for ``i`` and its registered name.

    The name is read from ``org_name_dict``. If it is not there yet the call
    waits for it, sleeping briefly between polls instead of spinning at full
    speed. The organisation number is computed once, outside the loop.

    :param i: sequence number
    :param Area_code: area-code prefix (string)
    :param timeout: optional maximum wait in seconds; ``None`` (the default)
        waits indefinitely
    :return: ``(org_no, org_name)`` tuple
    :raises TimeoutError: if ``timeout`` is given and the name does not
        appear in time
    """
    org_no = get_mng_org_no(i, Area_code)
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        org_name = org_name_dict.get(org_no)
        if org_name is not None:
            return org_no, org_name
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError("no organisation name registered for {0}".format(org_no))
        time.sleep(0.01)
def get_boe_org_no():
    """
    Build a BoEing organisation number.

    :return: ``190`` followed by three distinct random digits
    """
    digits = random.sample(list(range(10)), 3)
    return "190" + "".join(map(str, digits))
def get_boe_org_name():
    """
    Generate a mock BoEing organisation name.

    Combines a randomly chosen bank word with a Faker street name.

    :return: organisation name string
    """
    bank_list = ['人民', '建设', '农业', '邮政']
    street = faker.street_name()
    bank = random.choice(bank_list)
    return "中国{bank}银行股份有限公司{street}支行".format(bank=bank, street=street)
def get_offc_pho():
    """
    Generate a mock office phone number.

    :return: ``886`` followed by five distinct random digits
    """
    digits = random.sample(list(range(10)), 5)
    return "886" + "".join(map(str, digits))
def get_prfn():
    """
    Pick a random job title.

    :return: one of the five predefined job titles
    """
    titles = ["行长", "副行长", "科员", "部门经理", "部门副经理"]
    picked = random.choice(titles)
    return picked
def get_prfn_lvl(prfn):
    """
    Map a job title to its rank.

    :param prfn: job title
    :return: rank string, or ``None`` for an unknown title
    """
    rank_by_title = {
        "部门经理": "科级",
        "部门副经理": "副科级",
        "行长": "处级",
        "副行长": "副处级",
        "科员": "科员",
    }
    if prfn in rank_by_title:
        return rank_by_title[prfn]
    return None
def get_post(prfn):
    """
    Map a job title to its post.

    :param prfn: job title
    :return: post string, or ``None`` for an unknown title
    """
    post_by_title = {
        "行长": "一级支行行长",
        "副行长": "一级支行副行长",
        "科员": "大堂经理",
        "部门经理": "部门经理",
        "部门副经理": "部门副经理",
    }
    if prfn in post_by_title:
        return post_by_title[prfn]
    return None
def write_cmm_org_map_boe(args):
    """
    Write rows of the CMM-to-BoEing organisation mapping table.

    One row is written for every sequence number in the inclusive range
    ``count``: organisation number, organisation name, BoEing organisation
    number, BoEing organisation name and province/city code.

    :param args: ``(file, count, Date_Id, seq, Area_code)`` where ``file`` is
        a writable text file, ``count`` is ``(first, last)`` and ``seq`` is
        the field separator
    :return: None
    """
    out_file, count, Date_Id, seq, Area_code = args[0], args[1], args[2], args[3], args[4]
    delimiter = "{seq}".format(seq=seq)
    first, last = count[0], count[1]
    for i in range(first, last + 1):
        org_no, org_name = get_org_no_name(i, Area_code)
        fields = [
            org_no,
            org_name,
            get_boe_org_no(),
            get_boe_org_name(),
            get_pro_code(org_name),
        ]
        out_file.write(delimiter.join(fields) + "\n")
def write_cmm_staf_info(args):
    """
    Write rows of the CMM staff information table.

    One row is written for every sequence number in the inclusive range
    ``count``. The staff number is the same value as the ID number, and the
    last column is an empty spare field.

    :param args: ``(file, count, Date_Id, seq, Area_code)`` where ``file`` is
        a writable text file, ``count`` is ``(first, last)`` and ``seq`` is
        the field separator
    :return: None
    """
    out_file, count, Date_Id, seq, Area_code = args[0], args[1], args[2], args[3], args[4]
    delimiter = "{seq}".format(seq=seq)
    first, last = count[0], count[1]
    for i in range(first, last + 1):
        id_card_no = get_id_card(i, Area_code)
        staff_name = faker.name()
        reg_org_no = get_term_org()
        status = str(random.randint(0, 1))
        office_phone = get_offc_pho()
        mobile = str(faker.phone_number())
        email = faker.email()
        title = get_prfn()
        row = [
            id_card_no,            # ID number
            id_card_no,            # staff number (same as the ID number)
            staff_name,            # staff name
            reg_org_no,            # registering organisation number
            status,                # staff status
            office_phone,          # office phone
            mobile,                # mobile number
            email,                 # e-mail
            title,                 # job title
            get_prfn_lvl(title),   # rank
            get_post(title),       # post
            "",                    # spare field
        ]
        out_file.write(delimiter.join(row) + "\n")
def write_cmm_cust_mn(args):
    """
    Write rows of the CMM customer management table.

    One row is written for every sequence number in the inclusive range
    ``count``.

    :param args: ``(file, count, Date_Id, seq, Area_code)`` where ``file`` is
        a writable text file, ``count`` is ``(first, last)`` and ``seq`` is
        the field separator
    :return: None
    """
    out_file, count, Date_Id, seq, Area_code = args[0], args[1], args[2], args[3], args[4]
    delimiter = "{seq}".format(seq=seq)
    first, last = count[0], count[1]
    for i in range(first, last + 1):
        cust_no = get_cust_no(i)                                  # customer number
        cust_name = faker.company()                               # customer name
        mng_org_no = get_mng_org_no(i, Area_code)                 # managing organisation number
        team_no = get_term_no(Area_code)                          # marketing team number
        team_org = get_term_org()                                 # marketing team organisation
        mng_id_card = get_id_card(i, Area_code)                   # managing account-manager number
        mng_type = str(random.randint(1, 2)).zfill(2)             # account-manager management type
        team_mng_type = str(random.randint(1, 2)).zfill(2)        # team management type
        assign_date = faker.date(pattern='%Y%m%d', end_datetime=datetime.now())  # assignment date
        assign_time = get_asign_time()                            # assignment time
        assigner_id = get_id_card2(mng_id_card, i, count)         # assigner staff number
        assigner_org = "4415" + str(random.randint(0, 1000)).zfill(6)  # assigner organisation number
        role_codes = [2005, 2003, 3005, 3008, 5001, 5003, 5008, 5001, 5002, 2001, 2008, 3001, 3002, 3003]
        assigner_role = str(random.sample(role_codes, 1)[0])      # assigner role number
        row = [
            cust_no, cust_name, mng_org_no, team_no, team_org, mng_id_card,
            mng_type, team_mng_type, assign_date, assign_time,
            assigner_id, assigner_org, assigner_role,
        ]
        out_file.write(delimiter.join(row) + "\n")
def write_cmm_org(args):
    """
    Write rows of the CMM organisation information table.

    One row is written for every sequence number in the inclusive range
    ``count``. Fields are joined with the ``seq`` separator passed in
    ``args``, the same way the other table writers do, instead of a
    hard-coded ``|!``.

    :param args: ``(file, count, Date_Id, seq, Area_code)`` where ``file`` is
        a writable text file, ``count`` is ``(first, last)`` and ``seq`` is
        the field separator
    :return: None
    """
    f_w_cmm_org, count, Date_Id, seq, Area_code = args[0], args[1], args[2], args[3], args[4]
    delimiter = "{seq}".format(seq=seq)
    print(count)
    for i in range(count[0], count[1] + 1):
        org_no = get_mng_org_no(i, Area_code)             # organisation number
        org_name = get_org_name(org_no)                   # organisation name
        pro_cod = get_pro_code(org_name)                  # province/city code
        org_typ = str(random.randint(1, 2))               # organisation type
        org_sts = str(random.randint(0, 2))               # organisation status
        org_lvl = str(random.randint(1, 6))               # organisation level
        parnt_org_no = get_mng_org_no(i, Area_code)       # parent organisation number (same as org_no)
        parnt_org_nam = get_parnt_org_name(org_name)      # parent organisation name
        msk_cod = ""                                      # organisation mask
        dis_ord = str(random.randint(1, 99))              # display order
        stop_rsn = ""                                     # reason for deactivation
        # NOTE(review): org_sts is emitted twice; kept as-is so the column
        # count of the output file does not change - confirm against the
        # target table layout.
        line = delimiter.join(
            [org_no, org_name, pro_cod, org_typ, org_sts, org_sts, org_lvl,
             parnt_org_no, parnt_org_nam, msk_cod, dis_ord, stop_rsn]) + "\n"
        f_w_cmm_org.write(line)
def get_thread_count(count, size=2000):
    """
    Split ``count`` rows into consecutive chunks, one per worker task.

    Each chunk is an inclusive ``(first, last)`` pair of 1-based sequence
    numbers holding at most ``size`` rows. A ``count`` smaller than ``size``
    yields a single chunk ``(1, count)`` rather than no chunks at all.

    :param count: total number of rows to generate
    :param size: maximum rows per chunk (default 2000)
    :return: ``(thread_count, count_list)``; ``thread_count`` is the number
        of chunks but never less than 1, so it is always a valid pool size
    :raises ValueError: if ``size`` is not positive
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    count_list = [
        (first, min(first + size - 1, count))
        for first in range(1, count + 1, size)
    ]
    thread_count = max(len(count_list), 1)
    print("开启线程数{0}个".format(thread_count))
    return thread_count, count_list
def from_txt_to_gz_file(file_list):
    """
    Compress every file in ``file_list`` to ``.gz`` using a thread pool.

    The callable and its argument are handed to ``submit`` separately so the
    compression really runs on the pool. Any error raised while compressing
    a file is re-raised to the caller once all files have been processed.
    An empty list is a no-op.

    :param file_list: paths of the text files to compress
    :return: None
    """
    print("开始压缩个人移动电话模拟数据和个人基本信息数据")
    if not file_list:
        # A pool with zero workers is not allowed, and there is nothing to do.
        return
    with ThreadPoolExecutor(len(file_list)) as pool:
        futures = [pool.submit(gz_file, path) for path in file_list]
    for future in futures:
        future.result()  # surfaces any exception raised by gz_file
def _run_cmm_writers(thread_count, jobs):
    """Run ``(writer, writer_args)`` jobs on a thread pool and re-raise the first worker error."""
    with ThreadPoolExecutor(thread_count) as pool:
        futures = [pool.submit(writer, writer_args) for writer, writer_args in jobs]
    for future in futures:
        future.result()


def create_cmm(Area_code, Date_Id, count, seq='|!', base_dir='/home/appuser/out_data/simula_data'):
    """
    Generate the four CMM mock-data files with worker threads, then gzip them.

    Files written (under ``<base_dir>/<Date_Id>/``):
    1. CMM customer management table
    2. CMM organisation information table
    3. CMM-to-BoEing organisation mapping table
    4. CMM staff information table

    The customer and organisation tables are produced first; the mapping and
    staff tables are produced afterwards because the mapping table reads the
    organisation names registered while the organisation table is written.
    Output files are opened in append mode and are always closed, even when
    a worker fails. A worker failure is re-raised instead of being dropped,
    and in that case the files are not compressed.

    :param Area_code: area-code prefix (string)
    :param Date_Id: business date used in the directory and file names
    :param count: number of rows to generate per table
    :param seq: field separator
    :param base_dir: root directory for the output
    :return: None
    """
    file_names = [
        # CMM customer management table
        "{Area_code}-CMM-OUT_CMM_A_CUST_MN_HOST-2G-{Date_Id}.txt".format(Area_code=Area_code,
                                                                         Date_Id=Date_Id),
        # CMM organisation information table
        "00-CMM-CMM_A_ORG-2G-{Date_Id}.txt".format(Date_Id=Date_Id),
        # CMM-to-BoEing organisation mapping table
        "00-CMM-CMM_A_ORG_MAP_BOE-2G-{Date_Id}.txt".format(Date_Id=Date_Id),
        # CMM staff information table
        "00-CMM-CMM_A_STAF_INFO-2G-{Date_Id}.txt".format(Date_Id=Date_Id),
    ]
    dir_path = dir_exists(os.path.join(base_dir, '{date}'.format(date=str(Date_Id))))
    file_list = [os.path.join(dir_path, file_name) for file_name in file_names]

    thread_count, count_list = get_thread_count(count)

    handles = []
    try:
        for path in file_list:
            handles.append(open(path, "a+", encoding="utf-8"))
        f_w_cust_mn, f_w_cmm_org, f_w_cmm_map_boe, f_w_cmm_staf = handles

        # Phase 1: customer management + organisation information.
        first_phase = []
        for chunk in count_list:
            first_phase.append((write_cmm_cust_mn, (f_w_cust_mn, chunk, Date_Id, seq, Area_code)))
            first_phase.append((write_cmm_org, (f_w_cmm_org, chunk, Date_Id, seq, Area_code)))
        _run_cmm_writers(thread_count, first_phase)

        # Phase 2: organisation mapping + staff information.
        second_phase = []
        for chunk in count_list:
            second_phase.append((write_cmm_org_map_boe, (f_w_cmm_map_boe, chunk, Date_Id, seq, Area_code)))
            second_phase.append((write_cmm_staf_info, (f_w_cmm_staf, chunk, Date_Id, seq, Area_code)))
        _run_cmm_writers(thread_count, second_phase)
    finally:
        # Close (and thereby flush) every file before compression reads it.
        for handle in handles:
            handle.close()

    from_txt_to_gz_file(file_list)  # compress the txt files to gz
if __name__ == '__main__':
    # Script entry point: generate one fixed batch and report the elapsed time.
    start = time.time()
    Date_Id = 20191223  # business date; datetime.now().strftime('%Y%m%d') would use today
    count = 10000  # rows per table
    seq = '|!'  # field separator
    Area_code = "44"  # province code
    print("开始生成CMM客户信息数据")
    create_cmm(str(Area_code), Date_Id, count, seq)
    elapsed = time.time() - start
    print("模拟生成{count}条数据运行时间:{time_}".format(count=count, time_=elapsed))