Offline Article Profile
Components of the offline article profile
- Article: channel ID, content, keywords, topic words
- The biggest difference between topic words and keywords is that topic words have been normalized.
Keywords: words that carry high weight within an article. Topic words: normalized words; synonyms that appear in the article and score highly in the computation.
Keywords: the top-K words (with their weights) produced by TextRank.
Topic words: the intersection of the TextRank top-K words and the TF-IDF top-K words, as sketched below.
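For illustration only (the word lists below are made up), the topic-word computation for a single article boils down to a set intersection of the two top-K lists:

# hypothetical top-K results for one article
textrank_topk = ["spark", "hadoop", "分布式", "集群"]      # assumed TextRank top-K words
tfidf_topk = ["spark", "集群", "机器学习", "分布式"]        # assumed TF-IDF top-K words
topic_words = list(set(textrank_topk) & set(tfidf_topk))   # intersection -> topic words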
Steps
- Merge the raw article tables to obtain the full text of every article:
  article title + channel name + article content make up the complete article text
- TF-IDF computation over all historical articles
- TextRank computation over all historical articles
Merging the raw article tables
- Initialize the Spark configuration
Create a Spark base class:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import os


class SparkSessionBase(object):

    SPARK_APP_NAME = None
    SPARK_URL = "yarn"
    SPARK_EXECUTOR_MEMORY = "2g"
    SPARK_EXECUTOR_CORES = 2
    SPARK_EXECUTOR_INSTANCES = 2
    ENABLE_HIVE_SUPPORT = False

    def _create_spark_session(self):
        conf = SparkConf()  # create the Spark config object
        config = (
            ("spark.app.name", self.SPARK_APP_NAME),  # app name; a random one is generated if not set
            ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),  # memory per executor, default 2g
            ("spark.master", self.SPARK_URL),  # address of the Spark master
            ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),  # CPU cores per executor, default 1
            ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES)
        )
        conf.setAll(config)

        # create the SparkSession from the config object
        if self.ENABLE_HIVE_SUPPORT:
            return SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()
        else:
            return SparkSession.builder.config(conf=conf).getOrCreate()
Afterwards, any class that needs a SparkSession can simply inherit from this base class.
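For example, the OriginArticleData class used in the next step can be a minimal subclass of SparkSessionBase (a sketch; the app name chosen here is an assumption):

from offline import SparkSessionBase


class OriginArticleData(SparkSessionBase):

    SPARK_APP_NAME = "mergeArticle"  # assumed app name
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):
        self.spark = self._create_spark_session()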
- Merge the contents of the three tables into one table and write it into Hive
oa = OriginArticleData()
oa.spark.sql("use toutiao")
# for speed, test with the data of a single article
basic_content = oa.spark.sql(
    "select a.article_id, a.channel_id, a.title, b.content from news_article_basic a inner join news_article_content b on a.article_id=b.article_id where a.article_id=116636")

import pyspark.sql.functions as F
import gc

# add the channel name; it is needed later
basic_content.registerTempTable("temparticle")
channel_basic_content = oa.spark.sql(
    "select t.*, n.channel_name from temparticle t left join news_channel n on t.channel_id=n.channel_id")

# use concat_ws to merge several columns (channel name, title and content) into one long text column
oa.spark.sql("use article")
sentence_df = channel_basic_content.select("article_id", "channel_id", "channel_name", "title", "content",
                                            F.concat_ws(
                                                ",",
                                                channel_basic_content.channel_name,
                                                channel_basic_content.title,
                                                channel_basic_content.content
                                            ).alias("sentence")
                                            )
del basic_content
del channel_basic_content
gc.collect()

# sentence_df.write.insertInto("article_data")
Check the result in the Hive CLI:
hive> select * from article_data limit 1;
- The article database stores the results of the article computations
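If the article_data table has not been created yet, a Hive DDL matching the columns selected above could look like the following sketch (the column types are inferred from the query, so treat them as assumptions):

oa.spark.sql("use article")
oa.spark.sql("""
    CREATE TABLE IF NOT EXISTS article_data(
        article_id BIGINT comment "article_id",
        channel_id INT comment "channel_id",
        channel_name STRING comment "channel name",
        title STRING comment "article title",
        content STRING comment "article content",
        sentence STRING comment "channel name + title + content")
""")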
TF-IDF computation
Read the N articles
Tokenize the article data to obtain the word list of each article
Train and save the TF-IDF model; in Spark this is done with CountVectorizer (counts) plus IDF
TF-IDF computation plan:
First compute the term frequencies of every tokenized article, producing the CV (CountVectorizer) model
Then compute the IDF value of every word from those term frequencies, producing the IDF model
Use the two models to compute the TF-IDF values of the N articles
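For reference, Spark ML's IDF estimator uses a smoothed inverse document frequency, so the values computed below correspond to:

$$\mathrm{idf}(t) = \log\frac{N + 1}{\mathrm{df}(t) + 1}, \qquad \mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \cdot \mathrm{idf}(t)$$

where $N$ is the number of training documents, $\mathrm{df}(t)$ is the number of documents containing term $t$, and $\mathrm{tf}(t, d)$ is the count of $t$ in document $d$.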
import os
import sys

# when running this file directly, add the project root to sys.path to avoid import errors
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))

PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# when several Python versions are installed, not pinning the interpreter often causes errors
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

from offline import SparkSessionBase


class KeywordsToTfidf(SparkSessionBase):

    SPARK_APP_NAME = "keywordsByTFIDF"
    SPARK_EXECUTOR_MEMORY = "7g"
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):
        self.spark = self._create_spark_session()


ktt = KeywordsToTfidf()
Read the raw article data:
ktt.spark.sql("use article")
article_dataframe = ktt.spark.sql("select * from article_data limit 20")
words_df = article_dataframe.rdd.mapPartitions(segmentation).toDF(["article_id", "channel_id", "words"])
Tokenization (the segmentation function used above):
def segmentation(partition):
    import os
    import re

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # load the custom user dictionary into jieba
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # stopword file
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """Return the list of stopwords."""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # the full stopword list
    stopwords_list = get_stopwords_list()

    # tokenization
    def cut_sentence(sentence):
        """Filter the segmented words: drop stopwords; keep nouns, English words and
        custom-dictionary words that are longer than one character."""
        # print(sentence, "*" * 100)
        # e.g. [pair('今天', 't'), pair('有', 'd'), pair('雾', 'n'), pair('霾', 'g')]
        seg_list = pseg.lcut(sentence)
        seg_list = [i for i in seg_list if i.word not in stopwords_list]
        filtered_words_list = []
        for seg in seg_list:
            # print(seg)
            if len(seg.word) <= 1:
                continue
            elif seg.flag == "eng":
                if len(seg.word) <= 2:
                    continue
                else:
                    filtered_words_list.append(seg.word)
            elif seg.flag.startswith("n"):
                filtered_words_list.append(seg.word)
            elif seg.flag in ["x", "eng"]:  # custom-dictionary word or English word
                filtered_words_list.append(seg.word)
        return filtered_words_list

    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)  # strip the HTML tags
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words
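With segmentation defined, the tokenization output can be spot-checked, for example:

# words_df has three columns: article_id, channel_id and words (the filtered token list)
words_df.printSchema()
words_df.show(3)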
Train the model to obtain the term-frequency (count) result for every word:

# word and term-frequency statistics
from pyspark.ml.feature import CountVectorizer

# vocabSize caps the total vocabulary size; minDF is the minimum number of documents a term must appear in
cv = CountVectorizer(inputCol="words", outputCol="countFeatures", vocabSize=200*10000, minDF=1.0)
# train the term-frequency model
cv_model = cv.fit(words_df)
cv_model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/CV.model")
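After fitting, the learned vocabulary can be inspected; the position of each word in this list is the index used later when joining back to idf_keywords_values. For example:

# first ten entries of the learned vocabulary
print(cv_model.vocabulary[:10])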
Train the IDF model:

# load the saved term-frequency (CountVectorizer) model
from pyspark.ml.feature import CountVectorizerModel
cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/CV.model")
# compute the term-frequency vectors
cv_result = cv_model.transform(words_df)
# train the IDF model
from pyspark.ml.feature import IDF
idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
idfModel = idf.fit(cv_result)
idfModel.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/IDF.model")
Create the idf_keywords_values table in Hive to hold the IDF profile of every keyword:

CREATE TABLE idf_keywords_values(
    keyword STRING comment "keyword",
    idf DOUBLE comment "idf",
    index INT comment "index");
Load the CV model and the IDF model (from the paths they were saved to above):

from pyspark.ml.feature import CountVectorizerModel
cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/CV.model")
from pyspark.ml.feature import IDFModel
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDF.model")
Save the vocabulary, the IDF values and the corresponding indices into the idf_keywords_values table:

keywords_list_with_idf = list(zip(cv_model.vocabulary, idf_model.idf.toArray()))

def func(data):
    # append each word's vocabulary index and cast its idf value to float
    for index in range(len(data)):
        data[index] = list(data[index])
        data[index].append(index)
        data[index][1] = float(data[index][1])

func(keywords_list_with_idf)
sc = ktt.spark.sparkContext
rdd = sc.parallelize(keywords_list_with_idf)
df = rdd.toDF(["keywords", "idf", "index"])

# df.write.insertInto('idf_keywords_values')
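Once the insertInto has been run, the table can be spot-checked, for example:

ktt.spark.sql("select * from idf_keywords_values limit 10").show()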
To save the TF-IDF results, create the tfidf_keywords_values table:

CREATE TABLE tfidf_keywords_values(
    article_id INT comment "article_id",
    channel_id INT comment "channel_id",
    keyword STRING comment "keyword",
    tfidf DOUBLE comment "tfidf");
Compute the TF-IDF values:

from pyspark.ml.feature import CountVectorizerModel
cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/CV.model")
from pyspark.ml.feature import IDFModel
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDF.model")
cv_result = cv_model.transform(words_df)
tfidf_result = idf_model.transform(cv_result)
def func(partition):
    TOPK = 20
    for row in partition:
        # pair each word index with its TF-IDF value and sort by value, descending
        _ = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        _ = sorted(_, key=lambda x: x[1], reverse=True)
        result = _[:TOPK]
        for word_index, tfidf in result:
            yield row.article_id, row.channel_id, int(word_index), round(float(tfidf), 4)

_keywordsByTFIDF = tfidf_result.rdd.mapPartitions(func).toDF(["article_id", "channel_id", "index", "tfidf"])

# join the word indices back to idf_keywords_values to recover the actual keywords
keywordsIndex = ktt.spark.sql("select keyword, index idx from idf_keywords_values")
keywordsByTFIDF = _keywordsByTFIDF.join(keywordsIndex, keywordsIndex.idx == _keywordsByTFIDF.index).select(["article_id", "channel_id", "keyword", "tfidf"])

# keywordsByTFIDF.write.insertInto("tfidf_keywords_values")
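Before (or after) writing the results into Hive, the joined keyword weights can be checked directly, for example:

keywordsByTFIDF.show(10)
# or, once the insertInto has been run:
# ktt.spark.sql("select * from tfidf_keywords_values limit 10").show()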
Read the words and IDF values trained on all articles from idf_keywords_values