from faker import Faker
import os, random, gzip
import re, time
from pypinyin import pinyin, Style
from datetime import datetime, timedelta
from concurrent.futures.thread import ThreadPoolExecutor
# Shared Faker instance using the zh_CN locale; it supplies the generated
# names and phone numbers used in this file.
faker = Faker('zh_CN')
def dir_exists(dir_path):
    """
    Ensure that a directory exists, creating it (and any missing parents).

    :param dir_path: path of the directory to check or create
    :return: the same ``dir_path`` that was passed in
    :raises FileExistsError: if ``dir_path`` exists but is not a directory
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # fail if the directory was created between the check and makedirs().
    os.makedirs(dir_path, exist_ok=True)
    return dir_path
def three_num():
    """
    Build a string of three distinct random decimal digits.

    :return: string of length 3, e.g. "407"
    """
    digits = list(range(10))
    picked = random.sample(digits, 3)
    return "".join(str(digit) for digit in picked)
def get_idd_code():
    """
    Pick a random international dialling (IDD) prefix.

    The prefix never starts with 0 and is either two or three digits long.

    :return: the prefix as a string
    """
    nonzero_digits = list(range(1, 10))
    all_digits = list(range(10))
    if random.randint(0, 1) == 0:
        # Two-digit prefix: two distinct non-zero digits.
        first, second = random.sample(nonzero_digits, 2)
        return "{0}{1}".format(first, second)
    # Three-digit prefix: a non-zero lead digit followed by two distinct digits.
    lead = random.choice(nonzero_digits)
    tail = random.sample(all_digits, 2)
    return str(lead) + "".join(map(str, tail))
def gz_phone_file(file_path):
    """
    Compress a data file into a gzip archive stored next to it.

    The archive path is ``file_path`` with its final extension replaced by
    ``.gz`` (``/data/x.txt`` -> ``/data/x.gz``); a path without an extension
    gets ``.gz`` appended.

    :param file_path: path of the file to compress
    :return: None
    """
    # os.path.splitext strips only the last extension of the file name. The
    # previous regex cut the path at the first dot anywhere in it (e.g. in a
    # directory name) and raised IndexError when the path had no dot at all.
    gz_path = os.path.splitext(file_path)[0] + ".gz"
    # The source is opened first so a missing input does not leave an empty
    # archive behind. Bytes are copied as-is (no decode/re-encode round trip)
    # and both handles are closed by the with-statement.
    with open(file_path, "rb") as f_r, gzip.open(gz_path, "wb") as f_w:
        f_w.writelines(f_r)
def get_homecif():
    """
    Build a two-digit registration CIF-S code whose first digit is not 0.

    :return: the two digits as a string
    """
    lead_pool = list(range(1, 10))
    tail_pool = list(range(10))
    (lead,) = random.sample(lead_pool, 1)
    (tail,) = random.sample(tail_pool, 1)
    return "{0}{1}".format(lead, tail)
def get_descriminator():
    """
    Build the classification discriminator flag string.

    Starts from 64 zeros and sets between 1 and 10 randomly drawn positions
    to 1. Positions may repeat, so fewer distinct flags can end up set.

    :return: 64-character string consisting of "0" and "1"
    """
    flags = ["0"] * 64
    draws = random.randint(1, 10)
    for _ in range(draws):
        flags[random.randint(0, 63)] = "1"
    return "".join(flags)
def get_LastTime(Date_id):
    """
    Build the "last access" and "last maintenance" timestamps.

    A random offset of 10 to 60 thirty-day periods is drawn. The access time
    is ``Date_id`` at midnight plus (offset - 5) periods; the maintenance time
    is ``Date_id`` combined with the current time of day plus the full offset.

    :param Date_id: base date in YYYYMMDD form (anything ``str()`` renders so)
    :return: tuple ``(LastAccessTime, LastMntTime)``, each formatted as
        YYYYMMDDHHMMSS
    """
    stamp_format = '%Y%m%d%H%M%S'
    periods = random.randint(10, 60)
    day_text = str(Date_id)

    # Last access time: base date (midnight) shifted by periods - 5.
    accessed = datetime.strptime(day_text, '%Y%m%d') + timedelta(days=(periods - 5) * 30)

    # Last maintenance time: base date with the current time of day attached,
    # shifted by the full number of periods.
    clock_text = datetime.now().strftime('%H%M%S')
    maintained = datetime.strptime(day_text + clock_text, stamp_format) + timedelta(days=periods * 30)

    return accessed.strftime(stamp_format), maintained.strftime(stamp_format)
def get_pid(i):
    """
    Build a PID: the prefix "16101" followed by ``i`` left-padded with zeros.

    The result is 16 characters long whenever ``str(i)`` has at most 11
    characters; longer values are appended unpadded.

    :param i: record number
    :return: the PID string
    """
    prefix = '16101'
    suffix_width = 16 - len(prefix)
    return prefix + str(i).rjust(suffix_width, '0')
def write_pre_phone(args):
    """
    Write one chunk of simulated personal mobile-phone records to a file.

    Each record is one line with four fields -- PID, serial number, IDD
    prefix and mobile number -- joined by the separator carried in ``args``.

    :param args: sequence ``(f_w_phone, count, Date_Id, seq)`` where
        ``f_w_phone`` is a writable text file object, ``count`` is an
        inclusive ``(first, last)`` range of record numbers, ``Date_Id`` is
        not used by this writer, and ``seq`` is the field separator
    :return: None
    """
    f_w_phone, count, seq = args[0], args[1], args[3]
    # Honour the separator passed by the caller instead of a hard-coded "|!",
    # so both output files use the same delimiter.
    separator = str(seq)
    for i in range(count[0], count[1] + 1):
        Pid = get_pid(i)  # PID
        SerialNo = random.randint(0, 1000000)  # serial number
        IddPrefix = get_idd_code()  # international dialling prefix
        MobileTel = faker.phone_number()  # mobile phone number
        fields = [Pid, SerialNo, IddPrefix, MobileTel]
        f_w_phone.write(separator.join(str(field) for field in fields) + "\n")
def write_pre_base(args):
    """
    Write one chunk of simulated personal basic-information records to a file.

    Each record is one line of ten fields joined by the separator carried in
    ``args``; the last field (record MAC) is always empty.

    :param args: sequence ``(f_w_base, count, Date_Id, seq)`` where
        ``f_w_base`` is a writable text file object, ``count`` is an inclusive
        ``(first, last)`` range of record numbers, ``Date_Id`` is the creation
        date written into every record, and ``seq`` is the field separator
    :return: None
    """
    f_w_base, count, Date_Id, seq = args[0], args[1], args[2], args[3]
    separator = "{seq}".format(seq=seq)
    first, last = count[0], count[1]
    for record_no in range(first, last + 1):
        pid = get_pid(record_no)  # PID
        home_cif = get_homecif()  # registration CIF-S code
        cn_name = faker.name()  # name
        # Pinyin form of the name: first reading of each syllable, upper-cased.
        syllables = pinyin(cn_name, style=Style.NORMAL)
        py_name = "".join(readings[0] for readings in syllables).upper()
        gender_code = random.randint(1, 2)  # gender
        discriminator = get_descriminator()  # classification discriminator
        building_date = Date_Id  # creation date
        last_access_time, last_mnt_time = get_LastTime(Date_Id)
        rec_mac = ""  # record MAC
        fields = [pid, home_cif, cn_name, py_name, gender_code, discriminator,
                  building_date, last_access_time, last_mnt_time, rec_mac]
        f_w_base.write(separator.join(map(str, fields)) + "\n")
def get_thread_count(count):
    """
    Split ``count`` records into fixed-size chunks, one per worker task.

    :param count: total number of records to generate (integer)
    :return: tuple ``(thread_count, count_list)``. ``count_list`` holds
        inclusive ``(first, last)`` record-number ranges that together cover
        1..count; ``thread_count`` is the number of ranges, but never less
        than 1 so it can be used directly as a pool size
    """
    size = 2000  # records generated by each chunk
    # Step through the record numbers directly. This also yields a range when
    # count is smaller than one chunk, which previously produced no ranges at
    # all (and therefore no data).
    count_list = [(first, min(first + size - 1, count))
                  for first in range(1, count + 1, size)]
    thread_count = max(1, len(count_list))
    print("开启线程数{0}个".format(thread_count))
    return thread_count, count_list
def create_pre_phone_base(Area_code, Date_Id, count, seq='|!',
                          out_dir='/home/appuser/out_data/simula_data'):
    """
    Generate the simulated mobile-phone and basic-information files with a
    thread pool.

    Two files are appended to under ``<out_dir>/<Date_Id>/``:
    ``<Area_code>-ABIS-D_P2MOBILE-3G-<Date_Id>.txt`` (PID, serial number, IDD
    prefix, phone number) and ``<Area_code>-ABIS-D_PBAS-3G-<Date_Id>.txt``
    (personal basic information).

    :param Area_code: province code used as the file-name prefix
    :param Date_Id: date in YYYYMMDD form; names the sub-directory and files
    :param count: number of records to generate
    :param seq: field separator handed to the record writers
    :param out_dir: root output directory (defaults to the path that was
        previously hard-coded)
    :return: None
    :raises Exception: re-raises the first exception raised by a writer task
    """
    # Personal mobile-phone table
    phone_file = "{Area_code}-ABIS-D_P2MOBILE-3G-{Date_Id}.txt".format(Area_code=Area_code, Date_Id=Date_Id)
    # Personal basic-information table
    base_file = "{Area_code}-ABIS-D_PBAS-3G-{Date_Id}.txt".format(Area_code=Area_code, Date_Id=Date_Id)

    dir_path = dir_exists(os.path.join(out_dir, str(Date_Id)))
    phone_file_path = os.path.join(dir_path, phone_file)
    base_file_path = os.path.join(dir_path, base_file)

    thread_count, count_list = get_thread_count(count)

    # The with-statements close both files and shut the pool down even when
    # something fails part-way through.
    with open(phone_file_path, "a+", encoding="utf-8") as f_w_phone, \
            open(base_file_path, "a+", encoding="utf-8") as f_w_base:
        futures = []
        with ThreadPoolExecutor(thread_count) as pool:
            for chunk in count_list:
                futures.append(pool.submit(write_pre_phone, (f_w_phone, chunk, Date_Id, seq)))
                futures.append(pool.submit(write_pre_base, (f_w_base, chunk, Date_Id, seq)))
        # Leaving the pool block waits for every task. result() then re-raises
        # any exception a writer hit, which would otherwise be silently lost.
        for future in futures:
            future.result()

    # NOTE: gzip compression of the two output files (via gz_phone_file) is
    # currently not performed here.
if __name__ == '__main__':
    started_at = time.time()
    # Fixed run date; use datetime.now().strftime('%Y%m%d') for today's date.
    Date_Id = 20191213
    count = 100000
    seq = '|!'  # field separator
    Area_code = 44  # province code
    # Generate the simulated mobile-phone and basic-information data.
    print("开始生成个人移动电话模拟数据和个人基本信息数据")
    create_pre_phone_base(Area_code, Date_Id, count, seq)
    elapsed = time.time() - started_at
    print("模拟生成{count}条数据运行时间:{time_}".format(count=count, time_=elapsed))