# -*- coding:utf-8 -*- from Tkinter import * from ScrolledText import ScrolledText import urllib,requests import re import threading import sys url_name = [] a = 1 def get(): global a#全局变量 hd = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'} url = 'http://www.budejie.com/video/'+str(a) varl.set('已经获取到第%s页视频'%(a)) html = requests.get(url,headers=hd).text #print html a+=1 url_content= re.compile(r'<div class="j-r-list-c">.*?</div>.*?</div>',re.S) url_contents =re.findall(url_content,html) #print url_contents for i in url_contents: url_reg = r'data-mp4="(.*?)">'#正则表达式 url_items = re.findall(url_reg,i) #print url_items#视频列表 if url_items:#判断地址视频存不存在 name_reg = re.compile(r'<a href="/detail-.{8}?.html">(.*?)</\w',re.S) name_items = re.findall(name_reg,i) #print name_items for i,k in zip(name_items,url_items):#标题与视频结合 url_name.append([i,k]) print i,k return url_name id = 1 def write(): global id while id<10: url_name = get() for i in url_name: urllib.urlretrieve(i[1],'video\\%s.mp4' % (i[0])) text.insert(END,str(id)+'.'+i[1]+'\n'+i[0]+'\n') url_name.pop(0) id +=1 varl.set('抓取完毕') def start(): th = threading.Thread(target=write) th.start()#触发 root = Tk() root.title('视频爬取') root.geometry('+400+100')#指定位置 text = ScrolledText(root,font=('微软雅黑',10)) text.grid() button = Button(root,text='开始爬取',font=('微软雅黑',10),fg='blue',command=start) button.grid() varl = StringVar() label = Label(root,font=('微软雅黑',10),fg='black',textvariable = varl) label.grid() varl.set('已准备...') root.mainloop()
百思不得姐视频爬取
最新推荐文章于 2024-08-11 22:06:01 发布