Online demo: https://www.886it.cn/pain-point-market/index
Table of Contents
1. Data collection and cleaning
2. Model recognition: NER, QA, BERT, clustering
3. Translation
4. Data presentation and API wrapping
1. Data collection and cleaning
1. For the Amazon crawler itself, see the separate crawler-collection post, which introduces the high-performance Amazon crawler.
2. Data-cleaning functions
After the data is written to the database, special and illegal characters need to be cleaned out, and redundant spaces and line breaks removed.
See the function delete_sysbol in the code below.
# -*- coding: utf-8 -*-
import re

import jieba.analyse
from textrank4zh import TextRank4Keyword, TextRank4Sentence

# Keyword extraction
def keywords_extraction(text):
    tr4w = TextRank4Keyword(allow_speech_tags=['n', 'nr', 'nrfg', 'ns', 'nt', 'nz'])
    # allow_speech_tags -- list of POS tags used to filter words by part of speech
    tr4w.analyze(text=text, window=2, lower=True, vertex_source='all_filters', edge_source='no_stop_words',
                 pagerank_config={'alpha': 0.85, })
    # text -- the text content, as a string
    # window -- window size (int) used to build edges between words; default 2
    # lower -- whether to lowercase English text; default False
    # vertex_source -- which of words_no_filter, words_no_stop_words, words_all_filters
    #                  is used to build the nodes of the PageRank graph;
    #                  default 'all_filters', options: 'no_filter', 'no_stop_words', 'all_filters'
    # edge_source -- which of words_no_filter, words_no_stop_words, words_all_filters
    #                is used to build the edges between nodes of the PageRank graph;
    #                default 'no_stop_words', options: 'no_filter', 'no_stop_words', 'all_filters'.
    #                Edge construction also depends on the `window` parameter
    # pagerank_config -- PageRank parameters; damping factor 0.85
    keywords = tr4w.get_keywords(num=16, word_min_len=2)
    # num -- number of keywords to return
    # word_min_len -- minimum word length; default 1
    return keywords
# Key-phrase extraction
def keyphrases_extraction(text):
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=text, window=2, lower=True, vertex_source='all_filters', edge_source='no_stop_words',
                 pagerank_config={'alpha': 0.85, })
    keyphrases = tr4w.get_keyphrases(keywords_num=6, min_occur_num=1)
    # keywords_num -- number of keywords used to build phrases
    # min_occur_num -- minimum number of times a key phrase must occur in the text
    return keyphrases
# Key-sentence extraction
def keysentences_extraction(text):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text, lower=True, source='all_filters')
    # text -- the text content, as a string
    # lower -- whether to lowercase English text; default False
    # source -- which of words_no_filter, words_no_stop_words, words_all_filters
    #           is used to compute the similarity between sentences;
    #           default 'all_filters', options: 'no_filter', 'no_stop_words', 'all_filters'
    # sim_func -- optional custom sentence-similarity function
    # Get the num most important sentences of length >= sentence_min_len for the summary
    keysentences = tr4s.get_key_sentences(num=3, sentence_min_len=6)
    return keysentences
def keywords_textrank(text):
    # jieba's built-in TextRank keyword extraction
    keywords = jieba.analyse.textrank(text, topK=6)
    return keywords
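# A minimal usage sketch for the four extraction helpers above, assuming
# textrank4zh's get_keywords items expose .word/.weight and its key
# sentences expose .index/.weight/.sentence; the sample text is invented.
def extraction_demo():
    sample_text = "手机电池续航太差,充电一晚上第二天下午就没电了。客服说这是正常现象。"
    for kw in keywords_extraction(sample_text):
        print(kw.word, kw.weight)               # keyword and its TextRank score
    print(keyphrases_extraction(sample_text))   # list of key-phrase strings
    for s in keysentences_extraction(sample_text):
        print(s.index, s.weight, s.sentence)    # position, score, sentence text
    print(keywords_textrank(sample_text))       # jieba's TextRank keywords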
def clearn_url(line):
    # Strip http:// and https:// URLs from the line
    results = re.compile(r'https?://[a-zA-Z0-9.?/&=:]*', re.S)
    line = results.sub("", line)
    return line
def clearn_emoji(desstr):
    # Remove emoji (characters outside the Basic Multilingual Plane)
    try:
        # Wide Unicode build: match astral code points directly
        co = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # Narrow build: match surrogate pairs instead
        co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return co.sub("", desstr)
def delete_double_space(line):
    # Collapse runs of spaces down to a single space
    while 1:
        if "  " in line:
            line = line.replace("  ", " ")
        else:
            return line
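# An equivalent one-pass alternative to the loop above (a sketch): a single
# regex substitution collapses any run of two or more spaces to one.
def delete_double_space_re(line):
    return re.sub(r' {2,}', ' ', line)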
# istxt -- whether the input is body text (currently unused)
def delete_sysbol(line, istxt=False):
    line = str(line)
    # Strip miscellaneous punctuation and special symbols
    r1 = r"[\^+~@#¥……&*()\(\)]"
    line = re.sub(r1, '', line)
    # Strip arrows and decorative symbols
    r1 = r"[↑↓←→↖↗↘↙↔↕➻➼➽➸➳➺➻➴➵➶➷➹▶►▷◁◀◄«»➩➪➫➬➭➮➯➱⏎➲➾♥➔⭐❤]"
    line = re.sub(r1, '', line)
    # Strip circled numbers
    r1 = r"[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳⓪]"
    line = re.sub(r1, '', line)
    # Strip mathematical symbols
    r1 = r"[﹢﹣×÷±/=≌∽≦≧≒﹤﹥≈≡≠=≤≥<>≮≯∷∶∫∮∝∞∧∨∑∏∪∩∈∵∴⊥∥∠⌒⊙⊕√∟⊿㏒㏑]"
    line = re.sub(r1, '', line)
    # Strip unit and superscript characters
    r1 = r"[°′″℃℉Å﹪‰㎡㏕㎜㎝㎞㏎³㎎㎏㏄º○¤%⁴⁵⁶⁷⁸⁹⁰⁺⁻⁼⁽⁾ʲʰʳʷʸⁿ]"
    line = re.sub(r1, '', line)
    # Strip quotation marks and other punctuation variants
    r1 = r"[‘“”〝〞ˆˇ﹔¨…¸;´~—ˉ|‖〃`@﹫¡¿﹏﹋﹌︴々﹟#﹩$﹠&﹪*﹡﹢﹦﹤]"
    line = re.sub(r1, '', line)
    # Replace brackets, braces and hyphens with spaces
    r1 = r"[\[\]{}-]"
    line = re.sub(r1, ' ', line)
    # Normalize all line breaks to spaces
    line = line.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
    # Collapse repeated spaces and trim the ends
    line = delete_double_space(line)
    line = line.strip()
    return line
def clearn_en_review(line):
    # Full cleaning pipeline for an English review: emoji first, then symbols
    line = clearn_emoji(line)
    line = delete_sysbol(line, istxt=False)
    return line
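# A quick sanity check of the cleaning pipeline on an invented messy review
# string (the expected output shown is approximate, not from the source).
def clean_demo():
    raw = "Great   phone!!! ①②  battery ↑↑ lasts\r\nall day ❤ ~~~"
    print(clearn_en_review(raw))  # roughly: "Great phone!!! battery lasts all day"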
def gesim_t():
    from gensim.models import Phrases
    # Paths to previously trained gensim phrase models (forward slashes, so
    # Windows backslash escapes don't corrupt the path)
    bigram_model_filepath = "D:/python/code/tag/ext/gensim/bigram_model"
    trigram_model_filepath = "D:/python/code/tag/ext/gensim/trigram_model"
    # Load the saved bigram and trigram models
    bigram_model = Phrases.load(bigram_model_filepath)
    trigram_model = Phrases.load(trigram_model_filepath)
    # Add a few extra token lists to each model's vocabulary
    trigram_model.add_vocab([["would", "always"], ["meow"]])
    bigram_model.add_vocab([["trees", "graph"], ["meow"]])
    # Freeze both models into their faster, immutable form
    trigram_model = trigram_model.freeze()
    bigram_model = bigram_model.freeze()
    # Apply the bigram model, then the trigram model on top of its output
    new_sentence = ["as", "sim", "card", "would", "always"]
    print(bigram_model[new_sentence])
    print(trigram_model[bigram_model[new_sentence]])
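# The file paths in gesim_t assume phrase models trained beforehand. A
# minimal training sketch (a hypothetical helper, not from the source),
# given an iterable of tokenized sentences:
def train_phrase_models(token_sentences, bigram_path, trigram_path):
    from gensim.models import Phrases
    # Learn bigrams from raw token lists, then trigrams from the bigram output
    bigram = Phrases(token_sentences, min_count=5, threshold=10.0)
    trigram = Phrases(bigram[token_sentences], min_count=5, threshold=10.0)
    bigram.save(bigram_path)
    trigram.save(trigram_path)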
if __name__ == '__main__':
    # split("/")[0] keeps the part before the first "/", or the whole
    # string when no "/" is present
    test = "12345678/asdas"
    test2 = "12345"
    print(test.split("/")[0])   # -> 12345678
    print(test2.split("/")[0])  # -> 12345