Scraping Taobao suit data with Python
Without further ado, here is the code.
# Scrape the data
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import csv
import re

titles, prices, shops, sales = [], [], [], []
# csvFile = open("xizhuang_women.csv", "w", newline='')  # newline='' avoids blank rows on Windows
csvFile = open("xizhuang_men.csv", "w", newline='')
writer = csv.writer(csvFile)
writer.writerow(('title', 'price', 'shop', 'sale'))

def getInfo(page):
    # url_women = "https://re.taobao.com/search?&extra=&refpid=420435_1006&keyword=%E8%A5%BF%E8%A3%85%20%E5%A5%B3&_input_charset=utf-8&page=" + str(page) + "&isinner=0&rewriteKeyword"
    url_men = "https://re.taobao.com/search?&extra=&refpid=420435_1006&keyword=%E8%A5%BF%E8%A3%85%20%E7%94%B7&_input_charset=utf-8&page=" + str(page) + "&isinner=0&rewriteKeyword"
    # Configure headless mode
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.add_argument("-headless")  # run Firefox without a visible window
    driver = webdriver.Firefox(options=fireFoxOptions)
    time.sleep(2)
    # driver.get(url_women)
    driver.get(url_men)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    titles = soup.findAll('span', class_='title')
    prices = soup.findAll('span', class_='pricedetail')
    shops = soup.findAll('span', class_='shopNick')
    sales = soup.findAll('span', class_='payNum')
    print(len(titles))
    for i in range(len(titles)):
        saleNum = re.findall(r"\d+\.?\d*", sales[i].get_text())  # extract the numeric sales count
        writer.writerow((titles[i].get_text(), prices[i].find('strong').get_text(),
                         shops[i].get_text(), ''.join(saleNum)))
    driver.quit()  # close the browser

for page in range(0, 10):  # scrape the first 10 pages
    print("Scraping page {}".format(page))
    getInfo(page)

csvFile.close()  # close the CSV file
print("Done!")
# Step 1: compute TF-IDF
import codecs
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import jieba
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

print('Computing TF-IDF:')
# Build the corpus
corpus = []  # corpus of tokenized titles
filepath_men = '.\\data\\xizhuang_men.csv'
filepath_women = '.\\data\\xizhuang_women.csv'
#def corpusCreate(path):
#    file = open(path, 'r')
#    lines = file.readlines()
#    file.close()
#
#    # Corpus with tokenization only (no stop-word removal)
#    for line in lines:
#        ziduan = line.split(',')
#        title = ziduan[0]
#        cut_text = jieba.cut(title)  # tokenize the title
#        result = " ".join(cut_text)
#        corpus.append(result)
#corpusCreate(filepath_men)
#corpusCreate(filepath_women)
# Corpus with stop words removed
jiebaTxt = open('.\\output\\jieba.txt', mode='w')  # store the preprocessed text results
remove_words = [u'的', u',',u'和', u'是', u'随着', u'对于', u'对',u'等',u'能',u'都',u'。',u' ',u'、
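Once corpus holds the jieba-tokenized titles with the words in remove_words filtered out, the TF-IDF step implied by the imports above can be sketched roughly as follows. This is only a sketch; the names weight and labels and the value n_clusters=5 are illustrative, not taken from the original script:

# Sketch: assumes corpus is a list of space-separated jieba tokens
# with the words in remove_words already filtered out
vectorizer = CountVectorizer()                # raw term-frequency counts
transformer = TfidfTransformer()              # turn counts into TF-IDF weights
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
word = vectorizer.get_feature_names_out()     # vocabulary (use get_feature_names() on older sklearn)
weight = tfidf.toarray()                      # dense TF-IDF matrix: one row per title

kmeans = KMeans(n_clusters=5, random_state=0) # example cluster count
labels = kmeans.fit_predict(weight)           # cluster label for each title
print(weight.shape, labels[:10])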