项目综合：功能实现+界面

最新推荐文章于 2025-05-24 22:12:16 发布

原创最新推荐文章于 2025-05-24 22:12:16 发布 · 672 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#微信 #淘宝 #python #爬虫 #gui

python笔记同时被 2 个专栏收录

24 篇文章

订阅专栏

中央财经大学

7 篇文章

订阅专栏

一、文件命名：

tk_test为GUI界面实现（基于tkinter）

total_selenium为微信公众号爬虫，使用selenium + chromedriver实现

final_test为淘宝爬虫，使用selenium + phantomjs实现

check_repeat通过计算图片的MD5值来实现查重、去重功能

mysql_py连接数据库，存储图片名称及其来源地址，方便调用

二、淘宝部分

from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re
import urllib
from tkinter import *
import threading
from mysql_py import *

# Module-level crawler state shared by the functions below.
#   store_list     - shop URLs appended by get_shop_url
#   store_img_list - image URLs consulted by get_img_url before downloading
#   href_list      - category links appended by get_item_href
#   item_list      - category names appended by get_item_href
store_list, store_img_list, href_list, item_list = [], [], [], []

# Number of shops processed so far; incremented by get_img_url.
total_count = 0

# Lock acquired by get_img_url around each shop download.
mutex = threading.Lock()

# Placeholders for the two text widgets; tb_get_text replaces them.
state_t = result_t = 0

# Database connection and cursor passed to the image-download threads.
db, db_cursor = init_fun("taobao")

def get_item_href():
    """Collect the 12 hot-category links from the Taobao shop-search page.

    For every category the link (with ``&sort=sale-desc`` appended) is added
    to the module-level ``href_list`` and its label to ``item_list``.
    Progress is written to the ``state_t`` and ``result_t`` text widgets.

    The PhantomJS driver is always shut down, even when the page or one of
    the elements cannot be loaded.
    """
    global href_list
    global result_t
    global state_t
    state_t.insert(END,"正在查找所有热门类目\n")
    driver = webdriver.PhantomJS()
    try:
        driver.get("https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306")
        result_t.insert(END,"一共有12个热门类目:\n")
        for i in range(12):
            # Look the anchor up once per category instead of three times.
            link = driver.find_element_by_xpath('//*[@id="shopsearchindex-hotcat"]/div/div/ul/li[%s]/a'%(i+1))
            # Build the final link before storing it, so the result does not
            # depend on href_list being empty when this function is entered.
            href = link.get_attribute('href') + '&sort=sale-desc'
            name = link.text
            href_list.append(href)
            item_list.append(name)
            result_t.insert(END,name+":")
            result_t.insert(END,href+'\n')
    finally:
        driver.quit()
    
def get_shop_url(store_list,start_url):
    """Append the shop URLs found on the first five result pages to store_list.

    Args:
        store_list: list that receives the normalised shop URLs; URLs already
            present are not added again.
        start_url: category search URL; ``&s=<offset>`` is appended with
            offsets 0, 20, 40, 60 and 80.
    """
    # One browser for all five pages, always shut down afterwards (the
    # original spawned a new PhantomJS per page and never quit any of them).
    driver = webdriver.PhantomJS()
    try:
        for page_index in range(5):
            driver.get(start_url + '&s=%s'%(page_index*20))
            page = driver.page_source
            # Dots are escaped so only literal "shopNNN.taobao.com" hosts match.
            for found in re.findall(r'//shop\d+\.taobao\.com',page,re.I):
                shop_url = get_total_url(found)
                if shop_url not in store_list:
                    store_list.append(shop_url)
    finally:
        driver.quit()

def get_img_url(shop_url,connect,cursor):
    """Download the alicdn ``.jpg`` images referenced on one shop home page.

    Each image is saved as ``<total_count>_<index>.jpeg`` in the download
    folder and recorded through ``insert_into_table``.  Progress goes to the
    ``state_t`` / ``result_t`` text widgets and ``total_count`` is incremented
    once per shop.

    Args:
        shop_url: URL of the shop home page.
        connect: database connection handed to ``insert_into_table``.
        cursor: database cursor handed to ``insert_into_table``.
    """
    global result_t
    global state_t
    global total_count
    # A bare ``import urllib`` does not guarantee the ``request`` submodule
    # is loaded, so import it explicitly here.
    import urllib.request

    download_path = r'C:\Users\Mr.Guo\Pictures\taobao'
    # ``with`` releases the lock even if something below raises; the original
    # acquire()/release() pair left it held forever on any error.
    with mutex:
        driver = webdriver.PhantomJS()
        try:
            driver.set_window_size(25600,14400)
            driver.get(shop_url)
            page_source = driver.page_source
        finally:
            driver.quit()
        # NOTE: ``[https:]?`` is a character class, so an absolute URL is
        # captured from its ':' onward; get_total_url's ':' branch puts the
        # scheme back.  The patterns are deliberately left unchanged.
        img_urls = re.findall(r'[https:]?//gdp.alicdn.com/.*?.jpg',page_source,re.I) + re.findall(r'[https:]?//img.alicdn.com/.*?.jpg',page_source,re.I)
        downloaded = 0
        for count, raw_url in enumerate(img_urls):
            img_url = get_total_url(raw_url)
            if img_url in store_img_list:
                continue
            try:
                store_name = "%s"%total_count+"_"+"%s"%count
                urllib.request.urlretrieve(img_url,download_path + r"\%s.jpeg"%store_name)
                state_t.insert(END,"download %s.jpeg\n"%store_name)
                insert_into_table(connect,cursor,store_name,shop_url)
            except Exception as e:
                # Best effort: one failed image must not abort the shop.
                print(e)
            else:
                # Remember the URL so the membership test above actually
                # skips images that were already fetched.
                store_img_list.append(img_url)
                downloaded += 1
        # Report the real number of saved images; this also works when the
        # page contained no matching image at all.
        result_t.insert(END,"第%s个店铺主页图片下载完成,共下载了%s张图片\n"%(total_count+1,downloaded))
        total_count += 1

def get_total_url(url):
    """Return ``url`` with an ``https`` scheme restored when it was cut off.

    ``//host`` gets ``https:`` prepended, ``/host`` gets ``https:/`` and
    ``://host`` gets ``https``; any other string is returned unchanged.
    """
    # Order matters: '//' must be tested before the shorter '/'.
    for prefix, missing in (('//', 'https:'), ('/', 'https:/'), (':', 'https')):
        if url.startswith(prefix):
            return missing + url
    return url

def taobao_main():
    """Run the whole Taobao crawl: categories, then shops, then images.

    1. ``get_item_href`` fills ``href_list`` / ``item_list``.
    2. One thread per category runs ``get_shop_url`` to fill ``store_list``.
    3. One thread per shop runs ``get_img_url``.

    Each stage is joined before the next one starts; progress is written to
    the ``state_t`` / ``result_t`` text widgets.
    """
    global state_t
    global result_t
    t = threading.Thread(target = get_item_href,args = ())
    t.start()
    t.join()
    result_t.insert(END,"----------------------------------------------------------------------")
    shop_threads = []
    for start_url, item_name in zip(href_list, item_list):
        state_t.insert(END,"正在获取%s类目下的销量前100的店铺url\n"%item_name)
        t = threading.Thread(target = get_shop_url,args = (store_list,start_url,))
        t.start()
        shop_threads.append(t)
    for shop_thread in shop_threads:
        shop_thread.join()
    img_threads = []
    for shop_url in store_list:
        state_t.insert(END,"正在下载店铺首页的图片:"+shop_url+'\n')
        t = threading.Thread(target = get_img_url,args = (shop_url,db,db_cursor,))
        t.start()
        img_threads.append(t)
    # Join every image thread.  The original joined ``t`` (the last thread
    # created) on each iteration, so "finished" could be reported while
    # downloads were still running.
    for img_thread in img_threads:
        img_thread.join()
    state_t.insert(END,"爬虫完成,结束")

def tb_get_text(t1,t2):
    """Store the widgets used for output in the module globals.

    Args:
        t1: object stored as ``state_t`` (progress messages).
        t2: object stored as ``result_t`` (result messages).
    """
    global state_t, result_t
    state_t, result_t = t1, t2

def tb_thread():
    """Return a new, not yet started thread whose target is taobao_main."""
    return threading.Thread(target = taobao_main,args = ())

#get_item_href()

三、微信部分

from selenium import webdriver
import time
from bs4 import BeautifulSoup
import threading
from mysql_py import *
import re
from tkinter import *

# HTTP request headers passed to requests.get in get_img.
header = dict([
    ('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
    ('Accept-Encoding','gzip, deflate'),
    ('Accept-Language','zh-CN,zh;q=0.9'),
    ('Connection','keep-alive'),
    ('Host','weixin.sogou.com'),
    ('Upgrade-Insecure-Requests','1'),
    ('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'),
])

# Counters: result pages visited (get_url_list) and articles processed (get_img).
global_count = download_count = 0

# Search topic (set by change_topic) and widget placeholders (set by wx_get_text).
topic = ""
state_t = result_t = 0

# Lock acquired by get_img around each article download.
mutex = threading.Lock()

#download the images of one article and record them in the db
def get_img(url,num,connect,cursor):
    """Download every ``<img data-src=...>`` image of one article.

    Images are saved as ``<num>_<index>.jpeg`` in the weixin picture folder
    and recorded with ``insert_into_table`` (picture name, article URL).
    Progress is written to ``state_t`` / ``result_t`` and ``download_count``
    is incremented once per article.

    Args:
        url: article URL.
        num: index of the article, used as the file-name prefix.
        connect: database connection handed to ``insert_into_table``.
        cursor: database cursor handed to ``insert_into_table``.
    """
    global state_t
    global result_t
    global download_count
    # Function-scope imports: this function has always called these libraries,
    # but the module's import block (as shown) never brings them into scope.
    import os
    import urllib.request
    import requests

    path = r"C:\Users\Mr.Guo\Pictures\weixin"
    # ``with`` releases the lock even when the request or the parsing fails;
    # a bare acquire()/release() pair would block every other worker forever.
    with mutex:
        response = requests.get(url,headers = header).content
        content = str(response,encoding = "utf-8")
        bs_obj = BeautifulSoup(content,"html.parser")
        # Replaces the check_mkdir helper, which is not defined or imported
        # in this module as shown.
        os.makedirs(path,exist_ok = True)
        count = 0
        for img in bs_obj.findAll("img"):
            src = img.attrs.get("data-src")
            if not src:
                # <img> without a lazy-load source: nothing to download.
                continue
            # Protocol-relative URLs need a scheme before urlretrieve accepts
            # them (get_total_url is not available in this module as shown).
            imgurl = 'https:' + src if src.startswith('//') else src
            # The "_" separator matches the Taobao crawler and avoids
            # collisions such as (num=1, count=12) vs (num=11, count=2).
            store_name = "%s_%s"%(num,count)
            try:
                urllib.request.urlretrieve(imgurl,path + r"\%s.jpeg"%store_name)
                state_t.insert(END,"download %s.jpeg\n"%store_name)
                # The original passed the undefined names ``url_num`` and
                # ``html`` here, so every image failed with a swallowed NameError.
                insert_into_table(connect,cursor,store_name,url)
            except Exception as e:
                # Best effort: one failed image must not abort the article.
                print(e)
                continue
            count += 1
        download_count += 1
        # ``count`` already holds only successful downloads, and
        # ``download_count`` is already the 1-based article number.
        result_t.insert(END,"第%s篇文章图片下载完成，共下载%s张图片\n"%(download_count,count))

#selenium + chromedriver: collect the article urls page by page
def get_url_list(driver,url_list,page_num):
    """Walk up to ``page_num`` result pages and collect the article URLs.

    For every ``div.txt-box`` entry the article link is appended to
    ``url_list`` and its title, source account and publication date are
    written to ``result_t``.  ``global_count`` is incremented once per page.

    Args:
        driver: selenium driver already showing the first result page.
        url_list: list that receives the article URLs.
        page_num: maximum number of result pages to visit.
    """
    global global_count
    global state_t
    global result_t
    # Compiled once (raw string) instead of once per article.
    digits = re.compile(r"\d+")
    for i in range(page_num):
        page_source = driver.page_source
        bs_obj = BeautifulSoup(page_source,"html.parser")
        for url in bs_obj.findAll("div",{"class":"txt-box"}):
            href = url.h3.a.attrs['href']
            url_list.append(href)
            state_t.insert(END,"文章地址:"+href+'\n')
            date = url.div.span.text
            try:
                # The first run of digits is treated as a Unix timestamp.
                article_time = time.strftime('%Y-%m-%d',time.localtime(int(digits.findall(date)[0])))
            except (IndexError,ValueError,OverflowError,OSError):
                # No usable timestamp: show the raw text instead.
                article_time = date
            result_t.insert(END,"文章名：%s\n来源公众号：%s\n发表日期：%s\n"%(url.h3.a.text,url.div.a.text,article_time))
            result_t.insert(END,"----------------------------------------------------------------------")
        global_count += 1
        next_link = bs_obj.find("a",{"id":"sogou_next"})
        if next_link is None:
            # Last result page: there is no "next" link to follow.
            break
        driver.get("http://weixin.sogou.com/weixin" + next_link.attrs['href'])
        time.sleep(1)

#main function
def sougou_main():
    """Search Sogou WeChat for ``topic`` and download the article images.

    Steps: open the database, drive Chrome through the login click and the
    search, read the result count, collect the article URLs with
    ``get_url_list``, then start one ``get_img`` thread per article and wait
    for all of them.  The browser and the database connection are closed when
    the work is done or an error occurs.
    """
    global topic,state_t,result_t
    #init db
    connect,cursor = init_fun('weixin')
    try:
        check_tab_exist(connect,cursor)

        state_t.insert(END,"正在登录中\n")
        #init driver
        driver = webdriver.Chrome()
        try:
            driver.get("http://weixin.sogou.com/")
            driver.find_element_by_xpath('//*[@id="loginBtn"]').click()
            time.sleep(5)
            result_t.insert(END,"登陆成功\n")

            #enter search word and send it
            driver.find_element_by_xpath('//*[@id="query"]').send_keys("%s"%topic)
            driver.find_element_by_xpath('//*[@id="searchForm"]/div/input[3]').click()
            time.sleep(2)

            #get articles num to get page nums
            count_text = driver.find_element_by_xpath('//*[@id="pagebar_container"]/div').text
            # Prefer a comma-grouped number ("1,234"), else the first plain
            # run of digits.  The original passed re.I as the second argument
            # of Pattern.findall, which is the start position, so the first
            # two characters of the text were skipped.
            match = re.search(r'\d+(?:,\d+)+',count_text) or re.search(r'\d+',count_text)
            num = match.group().replace(',',"") if match else "0"
            result_t.insert(END,"一共有%s条公众号文章\n"%num)
            # NOTE(review): floor division ignores a trailing partial page
            # (and yields 0 pages for fewer than 10 results); kept as in the
            # original.
            page_num = int(num) // 10

            #get articles url
            url_list = []
            state_t.insert(END,"正在获取每页文章地址")
            get_url_list(driver,url_list,page_num)
        finally:
            # The browser is only needed to collect the urls.
            driver.quit()

        state_t.insert(END,"正在获取每篇文章图片")
        #create threads to get imgs and store in db
        threads = []
        for url_num, article_url in enumerate(url_list):
            t = threading.Thread(target = get_img,args = (article_url,url_num,connect,cursor,))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
    finally:
        connect.close()

def wx_get_text(t1,t2):
    """Store the widgets used for output in the module globals.

    Args:
        t1: object stored as ``state_t`` (progress messages).
        t2: object stored as ``result_t`` (result messages).
    """
    global state_t, result_t
    state_t, result_t = t1, t2
    
def change_topic(t):
    """Set the module-level search ``topic`` to ``t``."""
    globals()["topic"] = t

def wx_thread():
    """Return a new, not yet started thread whose target is sougou_main."""
    return threading.Thread(target = sougou_main,args = ())

四、数据库部分

import pymysql

#open a connection to the local MySQL server
def init_fun(db_name):
    """Connect to database ``db_name`` and return ``(connection, cursor)``.

    The connection targets localhost:3306 as user ``root``.
    """
    settings = {
        "host": "localhost",
        "port": 3306,
        "user": "root",
        "passwd": "123456",
        "db": db_name,
    }
    connect = pymysql.connect(**settings)
    return connect,connect.cursor()

#create the Pictures table when it is missing
def check_tab_exist(connect,cursor):
    """Create table ``Pictures`` (pic_name primary key, shop_url) if absent.

    Args:
        connect: connection whose ``commit`` is called after the statement.
        cursor: cursor used to execute the statement.
    """
    ddl = (
        "create table if not exists Pictures("
        "pic_name char(255) not null primary key,"
        "shop_url char(255))"
    )
    cursor.execute(ddl)
    connect.commit()
    
#insert items into table
def insert_into_table(connect,cursor,pic_name,shop_url):
    """Insert one ``(pic_name, shop_url)`` row into ``Pictures`` and commit.

    The values are passed as query parameters so the driver escapes them;
    the scraped URL stored here may contain quotes, which broke (and could
    inject into) the original string-formatted statement.

    Args:
        connect: connection whose ``commit`` is called after the insert.
        cursor: cursor used to execute the statement.
        pic_name: picture name (primary key).
        shop_url: URL the picture was taken from.
    """
    cursor.execute("insert into Pictures(pic_name,shop_url) values (%s,%s)",(pic_name,shop_url))
    connect.commit()
    
#delete items from table
def delete_from_table(connect,cursor,pic_name):
    """Delete the ``Pictures`` row whose ``pic_name`` matches, then commit.

    The name is passed as a query parameter so the driver escapes it instead
    of it being spliced into the SQL text.

    Args:
        connect: connection whose ``commit`` is called after the delete.
        cursor: cursor used to execute the statement.
        pic_name: picture name (primary key) of the row to remove.
    """
    cursor.execute("delete from Pictures where pic_name = %s",(pic_name,))
    connect.commit()

def select_from_table(connect,cursor):
    """Return every ``pic_name`` stored in the ``Pictures`` table as a list.

    The table name uses the same spelling as the create/insert/delete
    statements (``Pictures``); the lower-case ``pictures`` used before names a
    different table on servers with case-sensitive table names.

    Args:
        connect: unused; kept so the signature matches the other helpers.
        cursor: cursor used to run the query.
    """
    cursor.execute("select pic_name from Pictures")
    return [row[0] for row in cursor.fetchall()]

'''    
#test
connect,cursor = init_fun("test")
check_tab_exist(connect,cursor)

count = 3
for i in range(count):
    pic_name = "test" + "%s"%i
    insert_into_table(connect,cursor,pic_name,"fuck")
for i in range(count):
    pic_name = "test" + "%s"%i
    delete_from_table(connect,cursor,pic_name)
'''

五、去重部分

import hashlib
from mysql_py import *
import os


def check_repeat_fun(name):
    """Remove duplicate pictures of one crawler, compared by MD5 digest.

    Every picture listed in the ``Pictures`` table of database ``name`` is
    hashed.  The first file with a given digest is kept; each later file with
    the same digest is deleted from disk and from the table.

    Args:
        name: used both as the database name and as the sub-folder of the
            picture directory (for example ``"taobao"``).
    """
    image_file = r'C:\Users\Mr.Guo\Pictures\%s'%name
    image_db,image_db_cursor = init_fun('%s'%name)
    try:
        # A set gives O(1) membership tests; the original list was O(n).
        seen_digests = set()
        for image_name in select_from_table(image_db,image_db_cursor):
            path = image_file + r'\%s.jpeg'%image_name
            try:
                # ``with`` closes the handle before os.remove may be called.
                with open(path,'rb') as image:
                    md5 = hashlib.md5(image.read()).hexdigest()
            except FileNotFoundError:
                # A row without a file must not abort the whole pass.
                continue
            if md5 in seen_digests:
                os.remove(path)
                delete_from_table(image_db,image_db_cursor,image_name)
            else:
                seen_digests.add(md5)
    finally:
        image_db.close()
            
#check_repeat_fun("taobao")

六、界面设计部分

from tkinter import *
from tkinter import ttk
from final_test import *
import threading
from total_selenium import *
#from check_repeat import *

# Create (but do not start) one worker thread per crawler.
test_tb_thread, test_wx_thread = tb_thread(), wx_thread()

#def funcs
def get_var_change():
    """Pass the current value of the entry variable ``var`` to change_topic."""
    change_topic(var.get())
    
def start_tb_crawler():
    """Start the Taobao crawler unless one is still running.

    A ``threading.Thread`` can be started only once, so binding the button
    straight to ``Thread.start`` raised RuntimeError on the second click.
    A fresh thread is created here whenever the previous one has been used.
    """
    global test_tb_thread
    if test_tb_thread.is_alive():
        return
    if test_tb_thread.ident is not None:
        # ident is set once a thread has been started: this one is spent.
        test_tb_thread = tb_thread()
    test_tb_thread.start()

#init tk root
tk = Tk()
tk.title("基于卷积神经网络的图像识别系统——郭磊 and 郭桐嘉")
tk.geometry('1300x768')
tab_control = ttk.Notebook(tk)

tb_tab = ttk.Frame(tab_control)
tab_control.add(tb_tab,text = "淘宝")

#feature label
tb_func_label = ttk.LabelFrame(tb_tab,text = "功能")
tb_func_label.grid(column = 0,row = 1,sticky = W)
ttk.Label(tb_func_label,text = '通用功能').grid(column = 0,row = 0)

#feature buttons
Button(tb_tab,text = "淘宝爬虫",command = start_tb_crawler,width = 7,height = 1).grid(column = 1,row = 1,sticky = W+N,pady = 10)
Button(tb_tab,text = "人脸识别",command = lambda : print(1),width = 7,height = 1).grid(column = 2,row = 1,sticky = W+N,padx = 10,pady = 10)

#run state (progress messages)
Label(tb_tab,text = "运行状态",fg = "blue").grid(column = 3,row = 1,sticky = W+S)
state_s = ttk.Scrollbar(tb_tab)
state_t = Text(tb_tab,width = 70,height = 50)
state_t.configure(yscrollcommand = state_s.set)
state_t.grid(column = 3,row = 2)
state_s['command'] = state_t.yview
state_s.grid(column = 4,row = 2,sticky = N+S)

#run result
Label(tb_tab,text = "运行结果",fg = "blue").grid(column = 5,row = 1,sticky = W+S)
result_s = ttk.Scrollbar(tb_tab)
result_t = Text(tb_tab,width = 70,height = 50)
result_t.configure(yscrollcommand = result_s.set)
result_t.grid(column = 5,row = 2)
result_s['command'] = result_t.yview
result_s.grid(column = 6,row = 2,sticky = N+S)

#lay out the tab control
tab_control.grid(column = 0,row = 0,sticky = W)

def start_wx_crawler():
    """Start the WeChat crawler unless one is still running.

    A ``threading.Thread`` can be started only once, so binding the button
    straight to ``Thread.start`` raised RuntimeError on the second click.
    A fresh thread is created here whenever the previous one has been used.
    """
    global test_wx_thread
    if test_wx_thread.is_alive():
        return
    if test_wx_thread.ident is not None:
        # ident is set once a thread has been started: this one is spent.
        test_wx_thread = wx_thread()
    test_wx_thread.start()

wx_tab = ttk.Frame(tab_control)
tab_control.add(wx_tab,text = "微信")

#feature label: topic input
wx_func_label = ttk.LabelFrame(wx_tab,text = "功能")
wx_func_label.grid(column = 0,row = 1,sticky = W)
ttk.Label(wx_func_label,text = '话题输入').grid(column = 0,row = 0)

#entry box
var = StringVar()
wx_entry = Entry(wx_tab,textvariable = var,width = 8)
wx_entry.grid(column = 1,row = 1,sticky = E)

#confirm-topic button
Button(wx_tab,text = "确认话题",command = get_var_change).grid(column = 2,row = 1)

#feature buttons
Button(wx_tab,text = "微信爬虫",command = start_wx_crawler,width = 7,height = 1).grid(column = 1,row = 2,pady = 10,sticky = W+N+E)
Button(wx_tab,text = "人脸识别",command = lambda : print(1),width = 7,height = 1).grid(column = 2,row = 2,padx = 10,pady = 10,sticky = W+N)

#feature label: common functions
wx_func_label2 = ttk.LabelFrame(wx_tab,text = "功能")
wx_func_label2.grid(column = 0,row = 2,sticky = W+N)
ttk.Label(wx_func_label2,text = '通用功能').grid(column = 0,row = 0,sticky = W+N)

#run state (progress messages)
Label(wx_tab,text = "运行状态",fg = "blue").grid(column = 3,row = 1,sticky = W+S)
wx_state_s = ttk.Scrollbar(wx_tab)
wx_state_t = Text(wx_tab,width = 70,height = 50)
wx_state_t.configure(yscrollcommand = wx_state_s.set)
wx_state_t.grid(column = 3,row = 2,rowspan = 5)
# Scroll this tab's own text widget; it was bound to the Taobao tab's
# state_t, so dragging the bar moved the wrong text box.
wx_state_s['command'] = wx_state_t.yview
wx_state_s.grid(column = 4,row = 2,rowspan = 5,sticky = N+S)

#run result
Label(wx_tab,text = "运行结果",fg = "blue").grid(column = 5,row = 1,sticky = W+S)
wx_result_s = ttk.Scrollbar(wx_tab)
wx_result_t = Text(wx_tab,width = 70,height = 50)
wx_result_t.configure(yscrollcommand = wx_result_s.set)
wx_result_t.grid(column = 5,row = 2,rowspan = 5)
# Same fix as above: it was bound to the Taobao tab's result_t.
wx_result_s['command'] = wx_result_t.yview
wx_result_s.grid(column = 6,row = 2,rowspan = 5,sticky = N+S)

#hand the text widgets to the two crawler modules
tb_get_text(state_t,result_t)
wx_get_text(wx_state_t,wx_result_t)

tk.mainloop()