import urllib.request
import os
import random
import time
import re
from selenium import webdriver
#browser=webdriver.Firefox(executable_path='/home/renpu/pachong/geckodriver')
def url_open(url):
    """Fetch *url* with a browser-like User-Agent and return the raw bytes.

    The old proxy-rotation code (a ProxyHandler-based opener) was dead,
    parked in a string literal; it has been removed.  Re-add a proxy
    opener here if proxy support is ever needed again.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    req = urllib.request.Request(url=url, headers=headers)
    # Close the connection deterministically instead of relying on GC.
    with urllib.request.urlopen(req) as response:
        return response.read()
def url_webdriver(url, browser):
    """Render *url* in *browser* (a selenium WebDriver-like object), dump the
    page source to 'jingdongweiber.txt' and print the unique .jpg file names
    found in it.

    Returns the de-duplicated list of .jpg file names (previously returned
    None; callers that ignore the result are unaffected).
    """
    browser.get(url)
    # page_source is already a str; the old .encode().decode('utf-8')
    # round trip was a no-op at best.
    html = browser.page_source
    # Explicit UTF-8 so the dump does not depend on the platform default
    # encoding (the old open() could raise on non-ASCII HTML on Windows).
    with open('jingdongweiber.txt', 'w', encoding='utf-8') as fjingdong:
        fjingdong.write(html)
    # Protocol-relative .jpg URLs, e.g. //img10.example.com/pic.jpg
    pattern = r'(?:(?://.{3,150}?)\.jpg)'
    # set() drops duplicate file names
    img_name = list({addr.split('/')[-1] for addr in re.findall(pattern, html)})
    print(img_name)
    for name in img_name:
        print(name)
    return img_name
# Dead code kept for reference: page-number scraper for the old comment
# pager.  It is a bare triple-quoted string, so it has no runtime effect.
'''def get_page(url):
html = url_open(url).decode('utf-8')
a=html.find('current-comment-page')+23
b=html.find(']', a)
return html[a:b]'''
def find_imgs(url):
html = url_open(url).decode('ISO-8859-1')
'''html=url_webdriver(url,browser).decode('utf-8')'''
print(html)
img_addrs = []
print(url)
a = html.find('img data-src=')
print(a)
print('找图片')
while a != -1:
b=html.find('.png',a,a+255)
#print(b)
if b==-1:
b=html.find('.jpg',a,a+255)
if b != -1:
img_addrs.append(html[a+14:b+4])
else:
b = a+14
a=html.find('img data-src=',b)
a=html.find('img src=')
while a != -1:
b=html.find('.png',a,a+255)
#print(b)
if b==-1:
b=html.find('.jpg',a,a+255)
if b != -1:
img_addrs.append(html[a+9:b+4])
else:
b = a+9
a=html.find('img src=',b)
a=html.find('img data-lazy-img=')
while a != -1:
b=html.find('.png',a,a+255)
#print(b)
if b==-1:
b=html.find('.jpg',a,a+255)
if b != -1:
img_addrs.append(html[a+19:b+4])
else:
b = a+19
a=html.find('img data-lazy-img',b)
for each in img_addrs:
print(each)
return img_addrs
def save_imgs(folder, img_addrs):
    """Download every address in *img_addrs* into the current directory.

    *folder* is unused here (the caller has already chdir'ed into it) but
    is kept for interface compatibility.  Addresses are expected to be
    protocol-relative ('//host/path.jpg'), as produced by find_imgs.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        each = 'http:' + each
        print(each)
        # Download first, then open the file, so a failed request does not
        # leave a truncated/empty file behind (the original opened first).
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
def download_mm(folder='OOXX'):
    """Create *folder* if needed, chdir into it and download all images
    found on the hard-coded target page.

    NOTE(review): this changes the process working directory and never
    changes it back — callers must not rely on the cwd afterwards.
    """
    if not os.path.exists(folder):
        os.mkdir(folder)
        print('创建成功')
    os.chdir(folder)
    url = 'https://wt.jd.com'
    # The old comment-pager loop (get_page / page_url) was dead code in a
    # string literal and has been removed.
    img_addrs = find_imgs(url)
    save_imgs(folder, img_addrs)
if __name__ == '__main__':
    # download_mm()  # legacy urllib-based entry point, kept disabled
    target_url = 'https://wt.jd.com'
    # Launch Firefox through the local geckodriver binary.
    firefox = webdriver.Firefox(executable_path='/home/renpu/pachong/geckodriver')
    url_webdriver(target_url, firefox)
import os
import random
import time
import re
from selenium import webdriver
#browser=webdriver.Firefox(executable_path='/home/renpu/pachong/geckodriver')
def url_open(url):
    """Fetch *url* with a browser-like User-Agent and return the raw bytes.

    The old proxy-rotation code (a ProxyHandler-based opener) was dead,
    parked in a string literal; it has been removed.  Re-add a proxy
    opener here if proxy support is ever needed again.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    req = urllib.request.Request(url=url, headers=headers)
    # Close the connection deterministically instead of relying on GC.
    with urllib.request.urlopen(req) as response:
        return response.read()
def url_webdriver(url, browser):
    """Render *url* in *browser* (a selenium WebDriver-like object), dump the
    page source to 'jingdongweiber.txt' and print the unique .jpg file names
    found in it.

    Returns the de-duplicated list of .jpg file names (previously returned
    None; callers that ignore the result are unaffected).
    """
    browser.get(url)
    # page_source is already a str; the old .encode().decode('utf-8')
    # round trip was a no-op at best.
    html = browser.page_source
    # Explicit UTF-8 so the dump does not depend on the platform default
    # encoding (the old open() could raise on non-ASCII HTML on Windows).
    with open('jingdongweiber.txt', 'w', encoding='utf-8') as fjingdong:
        fjingdong.write(html)
    # Protocol-relative .jpg URLs, e.g. //img10.example.com/pic.jpg
    pattern = r'(?:(?://.{3,150}?)\.jpg)'
    # set() drops duplicate file names
    img_name = list({addr.split('/')[-1] for addr in re.findall(pattern, html)})
    print(img_name)
    for name in img_name:
        print(name)
    return img_name
# Dead code kept for reference: page-number scraper for the old comment
# pager.  It is a bare triple-quoted string, so it has no runtime effect.
'''def get_page(url):
html = url_open(url).decode('utf-8')
a=html.find('current-comment-page')+23
b=html.find(']', a)
return html[a:b]'''
def find_imgs(url):
html = url_open(url).decode('ISO-8859-1')
'''html=url_webdriver(url,browser).decode('utf-8')'''
print(html)
img_addrs = []
print(url)
a = html.find('img data-src=')
print(a)
print('找图片')
while a != -1:
b=html.find('.png',a,a+255)
#print(b)
if b==-1:
b=html.find('.jpg',a,a+255)
if b != -1:
img_addrs.append(html[a+14:b+4])
else:
b = a+14
a=html.find('img data-src=',b)
a=html.find('img src=')
while a != -1:
b=html.find('.png',a,a+255)
#print(b)
if b==-1:
b=html.find('.jpg',a,a+255)
if b != -1:
img_addrs.append(html[a+9:b+4])
else:
b = a+9
a=html.find('img src=',b)
a=html.find('img data-lazy-img=')
while a != -1:
b=html.find('.png',a,a+255)
#print(b)
if b==-1:
b=html.find('.jpg',a,a+255)
if b != -1:
img_addrs.append(html[a+19:b+4])
else:
b = a+19
a=html.find('img data-lazy-img',b)
for each in img_addrs:
print(each)
return img_addrs
def save_imgs(folder, img_addrs):
    """Download every address in *img_addrs* into the current directory.

    *folder* is unused here (the caller has already chdir'ed into it) but
    is kept for interface compatibility.  Addresses are expected to be
    protocol-relative ('//host/path.jpg'), as produced by find_imgs.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        each = 'http:' + each
        print(each)
        # Download first, then open the file, so a failed request does not
        # leave a truncated/empty file behind (the original opened first).
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
def download_mm(folder='OOXX'):
    """Create *folder* if needed, chdir into it and download all images
    found on the hard-coded target page.

    NOTE(review): this changes the process working directory and never
    changes it back — callers must not rely on the cwd afterwards.
    """
    if not os.path.exists(folder):
        os.mkdir(folder)
        print('创建成功')
    os.chdir(folder)
    url = 'https://wt.jd.com'
    # The old comment-pager loop (get_page / page_url) was dead code in a
    # string literal and has been removed.
    img_addrs = find_imgs(url)
    save_imgs(folder, img_addrs)
if __name__ == '__main__':
    # download_mm()  # legacy urllib-based entry point, kept disabled
    target_url = 'https://wt.jd.com'
    # Launch Firefox through the local geckodriver binary.
    firefox = webdriver.Firefox(executable_path='/home/renpu/pachong/geckodriver')
    url_webdriver(target_url, firefox)