目录
1. 背景:
项目中有一个在前端显示词云的需求。前端使用 Ant Design + ECharts 实现;后端文本拆分(分词)的实现方案有很多:可以使用 jieba 库配合 Java/Python 实现,也可以采用 tensorflow_text 深度学习框架实现。经过对比,最终确定使用 tensorflow_text + zh_segmentation 实现。
2. 功能实现
- 使用apscheduler框架,每天下午15:45执行
main.py
# -*- coding:utf-8 -*-
from apscheduler.schedulers.blocking import BlockingScheduler
from splitKeyworld import Split_key_world
if __name__ == "__main__":
    # Entry point: register the keyword-splitting job and hand control to the
    # scheduler.
    split_key_world = Split_key_world()
    scheduler = BlockingScheduler()
    # Cron trigger: fire once a day at 15:45. The original call passed
    # ``minute=45`` twice, which is a SyntaxError (repeated keyword argument).
    # NOTE(review): the time is interpreted in the scheduler's timezone --
    # confirm the container's timezone matches the intended local time.
    scheduler.add_job(split_key_world.spilitKeyworld, 'cron', hour=15, minute=45)
    # BlockingScheduler.start() blocks the calling thread until shutdown.
    scheduler.start()
- 文本拆分
splitKeyworld.py
# -*- coding:utf-8 -*-
import re
import os
import itertools
import tensorflow as tf
from itertools import chain
from datetime import datetime
import tensorflow_text as text
from collections import Counter
from operator import itemgetter
import utils.mysqlUtil as mysqlUtil
class Split_key_world(object):
    """Segment the names stored in ``t_name`` into words and persist the word
    frequencies into ``t_cloud_word`` (the data behind the word cloud)."""

    # TensorFlow Hub handle of the Chinese segmentation model.
    MODEL_HANDLE = "https://hub.tensorflow.google.cn/google/zh_segmentation/1"

    # Whitespace and ASCII/CJK punctuation stripped from every token.
    # Raw string: the original non-raw literal contained the invalid escape
    # ``\]``. Compiled once instead of being re-parsed for every token.
    _NOISE_RE = re.compile(
        r"[\r|\n|\s!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~“”?,!【】()、。:;’‘……¥·]+"
    )

    def __init__(self):
        """
        Initialise the splitter.

        The original note says environment variables must be configured;
        which ones is not visible here -- presumably the database settings
        used by ``utils.mysqlUtil`` (TODO confirm).
        """

    @classmethod
    def _clean_word(cls, word):
        """Return *word* with whitespace, punctuation and backslashes removed."""
        # The original wrapped this in ``eval(repr(...))``, which is a no-op
        # round-trip for a str and an unnecessary use of ``eval``.
        return cls._NOISE_RE.sub("", word).replace('\\', '')

    def spilitKeyworld(self):
        """Tokenize every name in ``t_name`` and insert one row per kept word.

        Steps: read the names, segment them with the TF-Hub model, count the
        tokens, then insert each sufficiently long, non-empty cleaned token
        together with its count into ``t_cloud_word``. Returns ``None``.
        """
        rows = mysqlUtil.execute_select('select name from t_name')
        # Flatten the row tuples returned by the driver into a flat list and
        # drop NULL/empty names, which the tokenizer cannot segment.
        # NOTE(review): ``rows or ()`` assumes the helper may return None when
        # there is no data -- confirm against utils.mysqlUtil.
        projectNameList = [name for name in chain.from_iterable(rows or ()) if name]
        if not projectNameList:
            print('时间:', datetime.now(), '没有可拆分的名称,跳过本次执行')
            return

        segmenter = text.HubModuleTokenizer(self.MODEL_HANDLE)
        # Only the tokens are used; the start/end offsets are discarded.
        tokens, _starts, _ends = segmenter.tokenize_with_offsets(projectNameList)

        # ``to_list()`` yields one token list per input name; flatten them and
        # count occurrences. ``most_common()`` returns (token, count) pairs
        # sorted by count, highest first.
        sortList = Counter(chain.from_iterable(tokens.to_list())).most_common()

        for raw_token, count in sortList:
            # Tokens are UTF-8 bytes, so the length is a byte count: this drops
            # a single CJK character (3 bytes) and ASCII words of up to 3
            # letters.
            if len(raw_token) <= 3:
                continue
            cloud_world = self._clean_word(raw_token.decode('utf-8'))
            # A token made only of punctuation/whitespace is empty after
            # cleaning; do not store an empty word.
            if not cloud_world:
                continue
            # NOTE(review): SQL is built by concatenation. _clean_word strips
            # quotes and backslashes, which keeps the statement well-formed,
            # but a parameterized insert would be safer if
            # mysqlUtil.execute_insert supports parameters (signature not
            # visible here).
            sql = (
                "insert into t_cloud_word(cloud_world,world_count)"
                f" values ( '{cloud_world}','{count}' )"
            )
            mysqlUtil.execute_insert(sql)
        print('时间:', datetime.now(), '生成关键字成功,一共:', len(sortList), '条记录')
- 导出安装的包生成requirements.txt
pip freeze > requirements.txt
- Dockerfile
# Slim Debian-based Python 3.8 base image.
FROM python:3.8-slim-buster
# Do not write .pyc files inside the container.
ENV PYTHONDONTWRITEBYTECODE=1
# Unbuffered stdout/stderr so print() output reaches `docker logs` immediately.
ENV PYTHONUNBUFFERED=1
# Copy the dependency list on its own first so the install layer below stays
# cached until requirements.txt changes.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN python -m pip install --no-cache-dir -r requirements.txt
WORKDIR /app
COPY . /app
# Create an unprivileged user that owns /app and run the scheduler as that user.
RUN adduser -u 5678 --disabled-password --gecos "" appuser && chown -R appuser /app
USER appuser
CMD ["python", "main.py"]
3. 部署
生成镜像
docker build -t test:latest .
运行
docker run -d test:latest