一、文件命名:
tk_test为gui界面实现
total_selenium为微信公众号用selenium+chromedriver实现
final_test为淘宝的selenium + phantomjs实现
check_repeat 通过计算每张图片的 MD5 值来实现查重与去重功能
mysql_py 负责连接数据库,存储图片名称及其来源地址,方便其他模块调用
二、淘宝部分
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re
import urllib
from tkinter import *
import threading
from mysql_py import *
# Shop URLs collected by get_shop_url; the same list is handed to every shop thread.
store_list = []
# Number of shops already processed by get_img_url; also the file-name prefix.
total_count = 0
# Lock taken by get_img_url around the processing of one shop.
mutex = threading.Lock()
# Image URLs that get_img_url consults to skip already-seen images.
store_img_list = []
# Category search URLs filled in by get_item_href.
href_list = []
# Connection and cursor for the "taobao" database, opened at import time.
db,db_cursor = init_fun("taobao")
# Placeholders; tb_get_text replaces them with the two Tk Text widgets.
state_t = 0
result_t = 0
# Category display names filled in by get_item_href, parallel to href_list.
item_list = []
def get_item_href():
    """Collect the 12 hot-category links from the Taobao shop-search page.

    Appends every category URL (with ``&sort=sale-desc`` added) to the
    module-level ``href_list`` and its display text to ``item_list``, and
    writes progress to the ``state_t`` / ``result_t`` text widgets.
    """
    state_t.insert(END,"正在查找所有热门类目\n")
    driver = webdriver.PhantomJS()
    try:
        driver.get("https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306")
        result_t.insert(END,"一共有12个热门类目:\n")
        # XPath positions are 1-based.
        for position in range(1, 13):
            # Resolve the anchor once instead of three times per category.
            link = driver.find_element_by_xpath('//*[@id="shopsearchindex-hotcat"]/div/div/ul/li[%s]/a'%position)
            name = link.text
            href = link.get_attribute('href') + '&sort=sale-desc'
            # Append the finished values so the result does not depend on
            # href_list being empty when the function is entered.
            href_list.append(href)
            item_list.append(name)
            result_t.insert(END,name+":")
            result_t.insert(END,href+'\n')
    finally:
        # Shut the headless browser down even when a lookup fails.
        driver.quit()
def get_shop_url(store_list,start_url):
    """Collect shop URLs from the first five result pages of a category.

    Args:
        store_list: list that receives every shop URL not already in it.
        start_url: category search URL; ``&s=<offset>`` is appended with
            offsets 0, 20, 40, 60 and 80.
    """
    # Dots are escaped so that only literal "shopNNN.taobao.com" hosts match.
    shop_pattern = re.compile(r'//shop\d+\.taobao\.com',re.I)
    # One browser serves all five pages and is always shut down afterwards.
    driver = webdriver.PhantomJS()
    try:
        for page_index in range(5):
            driver.get(start_url + '&s=%s'%(page_index*20))
            for found in shop_pattern.findall(driver.page_source):
                shop_url = get_total_url(found)
                if shop_url not in store_list:
                    store_list.append(shop_url)
    finally:
        driver.quit()
def get_img_url(shop_url,connect,cursor):
    """Download the alicdn ``.jpg`` images found on one shop's home page.

    Each image is saved as ``<total_count>_<index>.jpeg`` in the download
    directory and recorded through ``insert_into_table``. Progress goes to
    the ``state_t`` / ``result_t`` text widgets. The whole call runs under
    the module-level ``mutex``.

    Args:
        shop_url: URL of the shop home page.
        connect: database connection passed on to ``insert_into_table``.
        cursor: database cursor passed on to ``insert_into_table``.
    """
    global total_count
    # ``import urllib`` alone does not guarantee the ``request`` submodule
    # is loaded, so import it explicitly here.
    import urllib.request
    download_path = r'C:\Users\Mr.Guo\Pictures\taobao'
    # ``with`` releases the lock even when the page load raises.
    with mutex:
        driver = webdriver.PhantomJS()
        try:
            driver.set_window_size(25600,14400)
            driver.get(shop_url)
            page_source = driver.page_source
        finally:
            driver.quit()
        # "(?:https?:)?" is an optional scheme; the former "[https:]?" was a
        # character class matching a single character.
        img_urls = (re.findall(r'(?:https?:)?//gdp\.alicdn\.com/.*?\.jpg',page_source,re.I)
                    + re.findall(r'(?:https?:)?//img\.alicdn\.com/.*?\.jpg',page_source,re.I))
        downloaded = 0
        for count, raw_url in enumerate(img_urls):
            img_url = get_total_url(raw_url)
            if img_url in store_img_list:
                continue
            store_name = "%s"%total_count+"_"+"%s"%count
            try:
                urllib.request.urlretrieve(img_url,download_path + r"\%s.jpeg"%store_name)
                state_t.insert(END,"download %s.jpeg\n"%store_name)
                insert_into_table(connect,cursor,store_name,shop_url)
            except Exception as e:
                # Best effort: one broken image must not stop the shop.
                print(e)
                continue
            # Remember the URL so the same image is not fetched again.
            store_img_list.append(img_url)
            downloaded += 1
        # Report the real number of successes (also valid when no image was found).
        result_t.insert(END,"第%s个店铺主页图片下载完成,共下载了%s张图片\n"%(total_count+1,downloaded))
        total_count += 1
def get_total_url(url):
    """Return *url* completed to an ``https`` URL when its prefix is cut off.

    ``//host/..`` gets ``https:``, ``/x`` gets ``https:/`` and ``://host``
    gets ``https`` prepended; any other value is returned unchanged.
    """
    # Checked in order, so "//" wins over "/".
    completions = (('//', 'https:'), ('/', 'https:/'), (':', 'https'))
    for marker, missing in completions:
        if url.startswith(marker):
            return missing + url
    return url
def taobao_main():
    """Run the whole Taobao crawl: categories, then shop URLs, then images.

    Progress is written to the ``state_t`` / ``result_t`` text widgets.
    """
    # Make sure the Pictures table exists before the download threads insert
    # into it, as the WeChat crawler does.
    check_tab_exist(db,db_cursor)
    category_thread = threading.Thread(target = get_item_href,args = ())
    category_thread.start()
    category_thread.join()
    result_t.insert(END,"----------------------------------------------------------------------")
    shop_threads = []
    for start_url, category in zip(href_list, item_list):
        state_t.insert(END,"正在获取%s类目下的销量前100的店铺url\n"%category)
        worker = threading.Thread(target = get_shop_url,args = (store_list,start_url,))
        worker.start()
        shop_threads.append(worker)
    for worker in shop_threads:
        worker.join()
    img_threads = []
    for shop_url in store_list:
        state_t.insert(END,"正在下载店铺首页的图片:"+shop_url+'\n')
        worker = threading.Thread(target = get_img_url,args = (shop_url,db,db_cursor,))
        worker.start()
        img_threads.append(worker)
    # Join every download thread; the previous loop joined only the last one.
    for worker in img_threads:
        worker.join()
    state_t.insert(END,"爬虫完成,结束")
def tb_get_text(t1,t2):
    """Store *t1* as the module's ``state_t`` and *t2* as its ``result_t``."""
    global state_t, result_t
    state_t, result_t = t1, t2
def tb_thread():
    """Return a new, not yet started thread that will run ``taobao_main``."""
    return threading.Thread(target = taobao_main,args = ())
#get_item_href()
三、微信部分
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import threading
from mysql_py import *
import re
from tkinter import *
# HTTP request headers that get_img passes to requests.get; the User-Agent
# value identifies the client as desktop Chrome.
header = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection':'keep-alive',
'Host':'weixin.sogou.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
}
# global_count is incremented by get_url_list for every article link;
# download_count is incremented by get_img for every article.
global_count = 0
download_count = 0
# topic is the search keyword set through change_topic; state_t / result_t
# are placeholders that wx_get_text replaces with the Tk Text widgets.
topic = ""
state_t = 0
result_t = 0
# Lock taken by get_img around the processing of one article.
mutex = threading.Lock()
#init get img function
def get_img(url,num,connect,cursor):
    """Download the ``data-src`` images of one WeChat article.

    Each image is saved as ``<num>_<index>.jpeg`` in the download directory
    and recorded through ``insert_into_table``. Progress goes to the
    ``state_t`` / ``result_t`` text widgets. The whole call runs under the
    module-level ``mutex``.

    Args:
        url: article URL.
        num: index of the article, used as the file-name prefix.
        connect: database connection passed on to ``insert_into_table``.
        cursor: database cursor passed on to ``insert_into_table``.
    """
    global download_count
    # Function-scope imports: the import block of this module does not bring
    # these names into scope.
    import os
    import urllib.request
    import requests
    path = r"C:\Users\Mr.Guo\Pictures\weixin"
    # ``with`` releases the lock even when the request or the parsing raises.
    with mutex:
        response = requests.get(url,headers = header).content
        content = str(response,encoding = "utf-8")
        bs_obj = BeautifulSoup(content,"html.parser")
        # Create the target directory once (replaces the undefined check_mkdir).
        os.makedirs(path, exist_ok=True)
        count = 0
        for img in bs_obj.findAll("img"):
            raw_src = img.attrs.get("data-src")
            if not raw_src:
                # <img> tags without data-src are skipped.
                continue
            # "_" keeps names unambiguous: without it article 1/image 11 and
            # article 11/image 1 would both be "111".
            store_name = "%s_%s"%(num,count)
            try:
                # NOTE(review): get_total_url is defined next to the Taobao
                # crawler; confirm it is importable from this module.
                imgurl = get_total_url(raw_src)
                urllib.request.urlretrieve(imgurl,os.path.join(path,"%s.jpeg"%store_name))
                state_t.insert(END,"download %s.jpeg\n"%store_name)
                # Record the article URL (the former names url_num/html were undefined).
                insert_into_table(connect,cursor,store_name,url)
            except Exception as e:
                # Best effort: one broken image must not stop the article.
                print(e)
                continue
            count += 1
        download_count += 1
        # download_count is already 1-based here; count holds the successes.
        result_t.insert(END,"第%s篇文章图片下载完成,共下载%s张图片\n"%(download_count,count))
#selenium + chromedrive to get articles'url
def get_url_list(driver,url_list,page_num):
    """Collect article links from up to *page_num* Sogou result pages.

    Every article URL is appended to *url_list*; title, account name and
    publication date are written to ``result_t``. Paging stops early when
    the current page has no "next" link.

    Args:
        driver: selenium driver already showing the first result page.
        url_list: list that receives the article URLs.
        page_num: maximum number of result pages to read.
    """
    global global_count
    # Compiled once; raw string avoids the invalid "\d" escape.
    digits = re.compile(r"\d+")
    for _ in range(page_num):
        bs_obj = BeautifulSoup(driver.page_source,"html.parser")
        for box in bs_obj.findAll("div",{"class":"txt-box"}):
            link = box.h3.a
            href = link.attrs['href']
            url_list.append(href)
            state_t.insert(END,"文章地址:"+href+'\n')
            date_text = box.div.span.text
            try:
                # The first run of digits is treated as a Unix timestamp.
                stamp = int(digits.findall(date_text)[0])
                article_time = time.strftime('%Y-%m-%d',time.localtime(stamp))
            except (IndexError, ValueError, OverflowError, OSError):
                # No usable timestamp: show the raw text instead.
                article_time = date_text
            result_t.insert(END,"文章名:%s\n来源公众号:%s\n发表日期:%s\n"%(link.text,box.div.a.text,article_time))
            result_t.insert(END,"----------------------------------------------------------------------")
            global_count += 1
        next_link = bs_obj.find("a",{"id":"sogou_next"})
        if next_link is None:
            # Last page reached; the former code raised AttributeError here.
            break
        driver.get("http://weixin.sogou.com/weixin" + next_link.attrs['href'])
        time.sleep(1)
#main function
def _parse_article_count(text):
    """Return the article count found in *text*, or 0 when it has no digits.

    A number written with "," thousands separators (any number of groups)
    is preferred; otherwise the first plain run of digits is used.
    """
    grouped = re.search(r'(?<!\d)\d{1,3}(?:,\d{3})+(?!\d)', text)
    if grouped is not None:
        return int(grouped.group().replace(',', ''))
    plain = re.search(r'\d+', text)
    return int(plain.group()) if plain is not None else 0


def sougou_main():
    """Run the WeChat crawl: log in, search ``topic``, collect links, fetch images.

    Progress is written to the ``state_t`` / ``result_t`` text widgets.
    """
    connect,cursor = init_fun('weixin')
    try:
        check_tab_exist(connect,cursor)
        state_t.insert(END,"正在登录中\n")
        url_list = []
        driver = webdriver.Chrome()
        try:
            driver.get("http://weixin.sogou.com/")
            driver.find_element_by_xpath('//*[@id="loginBtn"]').click()
            # Fixed pause after clicking the login button
            # (presumably time for the user to log in - TODO confirm).
            time.sleep(5)
            result_t.insert(END,"登陆成功\n")
            # Type the search keyword and submit it.
            driver.find_element_by_xpath('//*[@id="query"]').send_keys("%s"%topic)
            driver.find_element_by_xpath('//*[@id="searchForm"]/div/input[3]').click()
            time.sleep(2)
            # The former code passed re.I as findall's *pos* argument, which
            # skipped the first two characters of the text.
            count_text = driver.find_element_by_xpath('//*[@id="pagebar_container"]/div').text
            num = _parse_article_count(count_text)
            result_t.insert(END,"一共有%s条公众号文章\n"%num)
            # Ten articles per result page; a partial last page is not read.
            page_num = num // 10
            state_t.insert(END,"正在获取每页文章地址")
            get_url_list(driver,url_list,page_num)
        finally:
            # The browser is only needed for collecting the links.
            driver.quit()
        state_t.insert(END,"正在获取每篇文章图片")
        # One thread per article downloads its images and records them.
        threads = []
        for url_num, article_url in enumerate(url_list):
            t = threading.Thread(target = get_img,args = (article_url,url_num,connect,cursor,))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
    finally:
        cursor.close()
        connect.close()
def wx_get_text(t1,t2):
    """Store *t1* as the module's ``state_t`` and *t2* as its ``result_t``."""
    global state_t, result_t
    state_t, result_t = t1, t2
def change_topic(t):
    """Set the module-level search keyword ``topic`` to *t*."""
    globals()['topic'] = t
def wx_thread():
    """Return a new, not yet started thread that will run ``sougou_main``."""
    return threading.Thread(target = sougou_main,args = ())
四、数据库部分
import pymysql
#init
def init_fun(db_name,host = "localhost",port = 3306,user = "root",passwd = "123456"):
    """Open a MySQL connection to *db_name* and return ``(connection, cursor)``.

    Args:
        db_name: name of the database to use.
        host, port, user, passwd: connection settings. The defaults are the
            values that used to be hard-coded, so existing one-argument
            calls behave as before.
    """
    # ``password`` / ``database`` are the documented argument names;
    # ``passwd`` / ``db`` are legacy aliases.
    connect = pymysql.connect(host = host,port = port,user = user,password = passwd,database = db_name)
    return connect,connect.cursor()
#init table
def check_tab_exist(connect,cursor):
    """Create the ``Pictures`` table if it is missing, then commit."""
    create_sql = "create table if not exists Pictures(pic_name char(255) not null primary key,shop_url char(255))"
    cursor.execute(create_sql)
    connect.commit()
#insert items into table
def insert_into_table(connect,cursor,pic_name,shop_url):
    """Insert one ``(pic_name, shop_url)`` row into ``Pictures`` and commit.

    The values are passed as query parameters so that quotes inside a
    scraped URL can neither break nor alter the statement.
    """
    cursor.execute("insert into Pictures(pic_name,shop_url) values (%s,%s)",(pic_name,shop_url))
    connect.commit()
#delete items from table
def delete_from_table(connect,cursor,pic_name):
    """Delete the ``Pictures`` row whose ``pic_name`` equals *pic_name* and commit.

    The name is passed as a query parameter instead of being formatted into
    the SQL text.
    """
    cursor.execute("delete from Pictures where pic_name = %s",(pic_name,))
    connect.commit()
def select_from_table(connect,cursor):
    """Return every ``pic_name`` stored in the ``Pictures`` table as a list.

    *connect* is accepted for signature parity with the other helpers and
    is not used.
    """
    # Same spelling as in the CREATE TABLE statement: table names can be
    # case-sensitive, depending on the server platform.
    cursor.execute("select pic_name from Pictures")
    return [row[0] for row in cursor.fetchall()]
'''
#test
connect,cursor = init_fun("test")
check_tab_exist(connect,cursor)
count = 3
for i in range(count):
pic_name = "test" + "%s"%i
insert_into_table(connect,cursor,pic_name,"fuck")
for i in range(count):
pic_name = "test" + "%s"%i
delete_from_table(connect,cursor,pic_name)
'''
五、去重部分
import hashlib
from mysql_py import *
import os
def check_repeat_fun(name):
    """Remove duplicate images of one crawler, compared by MD5 digest.

    For every picture name stored in database *name*, the matching
    ``.jpeg`` file in the picture directory of the same name is hashed. The
    first file with a given digest is kept; later ones are deleted from
    disk and from the table.

    Args:
        name: used both as the database name and as the picture
            sub-directory name.
    """
    image_dir = r'C:\Users\Mr.Guo\Pictures\%s'%name
    image_db,image_db_cursor = init_fun('%s'%name)
    try:
        # A set gives constant-time membership tests.
        seen_digests = set()
        for image_name in select_from_table(image_db,image_db_cursor):
            path = image_dir + r'\%s.jpeg'%image_name
            try:
                # ``with`` closes the file before os.remove may run on it.
                with open(path,'rb') as image:
                    digest = hashlib.md5(image.read()).hexdigest()
            except FileNotFoundError:
                # A row without a file must not abort the whole run.
                continue
            if digest in seen_digests:
                os.remove(path)
                delete_from_table(image_db,image_db_cursor,image_name)
            else:
                seen_digests.add(digest)
    finally:
        image_db.close()
#check_repeat_fun("taobao")
六、界面设计部分
from tkinter import *
from tkinter import ttk
from final_test import *
import threading
from total_selenium import *
#from check_repeat import *
# Crawler thread objects, built once at import time by tb_thread() / wx_thread().
# NOTE(review): a threading.Thread object can be started only once.
test_tb_thread = tb_thread()
test_wx_thread = wx_thread()
#def funcs
def get_var_change():
    """Pass the current value of the Tk variable ``var`` to ``change_topic``."""
    change_topic(var.get())
def _start_once(thread):
    """Start *thread* on the first call only.

    Thread.start() raises RuntimeError when called twice, so further button
    clicks are ignored once the thread has been started.
    """
    if thread.ident is None:
        thread.start()


# Root window with one notebook tab per crawler.
tk = Tk()
tk.title("基于卷积神经网络的图像识别系统——郭磊 and 郭桐嘉")
tk.geometry('1300x768')
tab_control = ttk.Notebook(tk)

# ---- Taobao tab ----
tb_tab = ttk.Frame(tab_control)
tab_control.add(tb_tab,text = "淘宝")
# Function label frame.
tb_func_label = ttk.LabelFrame(tb_tab,text = "功能")
tb_func_label.grid(column = 0,row = 1,sticky = W)
ttk.Label(tb_func_label,text = '通用功能').grid(column = 0,row = 0)
# Action buttons.
Button(tb_tab,text = "淘宝爬虫",command = lambda : _start_once(test_tb_thread),width = 7,height = 1).grid(column = 1,row = 1,sticky = W+N,pady = 10)
Button(tb_tab,text = "人脸识别",command = lambda : print(1),width = 7,height = 1).grid(column = 2,row = 1,sticky = W+N,padx = 10,pady = 10)
# Status text with its scrollbar.
Label(tb_tab,text = "运行状态",fg = "blue").grid(column = 3,row = 1,sticky = W+S)
state_s = ttk.Scrollbar(tb_tab)
state_t = Text(tb_tab,width = 70,height = 50)
state_t.configure(yscrollcommand = state_s.set)
state_t.grid(column = 3,row = 2)
state_s['command'] = state_t.yview
state_s.grid(column = 4,row = 2,sticky = N+S)
# Result text with its scrollbar.
Label(tb_tab,text = "运行结果",fg = "blue").grid(column = 5,row = 1,sticky = W+S)
result_s = ttk.Scrollbar(tb_tab)
result_t = Text(tb_tab,width = 70,height = 50)
result_t.configure(yscrollcommand = result_s.set)
result_t.grid(column = 5,row = 2)
result_s['command'] = result_t.yview
result_s.grid(column = 6,row = 2,sticky = N+S)
# Place the notebook itself.
tab_control.grid(column = 0,row = 0,sticky = W)

# ---- WeChat tab ----
wx_tab = ttk.Frame(tab_control)
tab_control.add(wx_tab,text = "微信")
# Label frame for the topic input.
wx_func_label = ttk.LabelFrame(wx_tab,text = "功能")
wx_func_label.grid(column = 0,row = 1,sticky = W)
ttk.Label(wx_func_label,text = '话题输入').grid(column = 0,row = 0)
# Topic entry bound to ``var``.
var = StringVar()
wx_entry = Entry(wx_tab,textvariable = var,width = 8)
wx_entry.grid(column = 1,row = 1,sticky = E)
# Button that confirms the topic.
Button(wx_tab,text = "确认话题",command = get_var_change).grid(column = 2,row = 1)
# Action buttons.
Button(wx_tab,text = "微信爬虫",command = lambda : _start_once(test_wx_thread),width = 7,height = 1).grid(column = 1,row = 2,pady = 10,sticky = W+N+E)
Button(wx_tab,text = "人脸识别",command = lambda : print(1),width = 7,height = 1).grid(column = 2,row = 2,padx = 10,pady = 10,sticky = W+N)
# Second label frame (general functions).
wx_func_label2 = ttk.LabelFrame(wx_tab,text = "功能")
wx_func_label2.grid(column = 0,row = 2,sticky = W+N)
ttk.Label(wx_func_label2,text = '通用功能').grid(column = 0,row = 0,sticky = W+N)
# Status text with its scrollbar.
Label(wx_tab,text = "运行状态",fg = "blue").grid(column = 3,row = 1,sticky = W+S)
wx_state_s = ttk.Scrollbar(wx_tab)
wx_state_t = Text(wx_tab,width = 70,height = 50)
wx_state_t.configure(yscrollcommand = wx_state_s.set)
wx_state_t.grid(column = 3,row = 2,rowspan = 5)
# The scrollbar must drive the WeChat status text, not the Taobao one.
wx_state_s['command'] = wx_state_t.yview
wx_state_s.grid(column = 4,row = 2,rowspan = 5,sticky = N+S)
# Result text with its scrollbar.
Label(wx_tab,text = "运行结果",fg = "blue").grid(column = 5,row = 1,sticky = W+S)
wx_result_s = ttk.Scrollbar(wx_tab)
wx_result_t = Text(wx_tab,width = 70,height = 50)
wx_result_t.configure(yscrollcommand = wx_result_s.set)
wx_result_t.grid(column = 5,row = 2,rowspan = 5)
# The scrollbar must drive the WeChat result text, not the Taobao one.
wx_result_s['command'] = wx_result_t.yview
wx_result_s.grid(column = 6,row = 2,rowspan = 5,sticky = N+S)

# Hand the text widgets to the two crawler modules, then enter the event loop.
tb_get_text(state_t,result_t)
wx_get_text(wx_state_t,wx_result_t)
tk.mainloop()