Python爬取虎扑步行街,爆照区中的照片

使用的是Python3+requests
源码如下
更新于2017年9月16日


#encoding:gbk
import os
import re
import time

import requests


def get_pages(url, start=1, stop=5):
    """Build the URL of each listing page to crawl.

    Each page URL is formed as ``<url>-<page number>``.

    Args:
        url: Base URL of the board, without a page suffix.
        start: First page number to include (default 1).
        stop: Page number to stop before (exclusive, default 5), so the
            defaults produce pages 1 through 4.

    Returns:
        A list of page URLs in ascending page order; empty when
        ``start >= stop``.
    """
    return [url + '-' + str(page_no) for page_no in range(start, stop)]

def get_links(url):
    """Fetch one listing page and return the URLs of the posts linked on it.

    Args:
        url: URL of a listing page.

    Returns:
        A list of absolute post URLs, in the order they appear in the page
        HTML (duplicates are kept).
    """
    print("pageUrl\t"+url)
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'utf-8'
    basic_url = 'https://bbs.hupu.com'
    # Raw string so "\d" reaches the regex engine intact; the dot is escaped
    # so that only a literal ".html" suffix matches.
    pattern = r'<a  href="(/\d+\.html)'
    tie_items = re.findall(pattern, response.text)
    print("tie_items"+str(tie_items))
    # The captured paths are site-relative; prefix them with the site root.
    return [basic_url + item for item in tie_items]

def get_img(url,tie_id):
    """Download every matching image referenced in a post page.

    Images are saved as ``<file_path>/<tie_id>-<n>.jpg`` where ``n`` counts
    the matches from 1 in the order they appear in the page HTML.

    Args:
        url: URL of the post page.
        tie_id: Post ID as a string; used as the file-name prefix so that a
            saved image can be traced back to its post.
    """
    file_path = 'E:/bxj1'  # directory where images are stored
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    # Create the target directory up front instead of failing on first write.
    os.makedirs(file_path, exist_ok=True)
    # A timeout keeps a stalled connection from hanging the whole crawl.
    page = requests.get(url, headers=headers, timeout=30)
    page.encoding = 'utf-8'
    # Raw string with the host's dots escaped so they match literally.
    pattern = r'https://i10\.hoopchina\.com\.cn/.+?jpeg'
    items = re.findall(pattern, page.text, re.S)
    for n, link in enumerate(items, 1):
        filename = file_path+"/"+tie_id+"-"+str(n)+".jpg"
        image = requests.get(link, timeout=30)
        # The context manager closes the file even if the write fails.
        with open(filename, 'wb') as out:
            out.write(image.content)
        print(filename,'is download')


bxj = 'https://bbs.hupu.com/selfie'
page_list = get_pages(bxj)  # URL of each listing page
# Collect the post links from every listing page into one flat list.
alllinks = []
for page in page_list:
    alllinks.extend(get_links(page))


print("alllinks"+str(alllinks))
# Visit each post and download its images.
for img_link in alllinks:
    # The post ID is the last path segment without its extension,
    # e.g. ".../20041029.html" -> "20041029".
    tie_id = img_link.rsplit('/', 1)[-1].split('.')[0]
    get_img(img_link, tie_id)
    time.sleep(2)  # pause 2 seconds between posts










控制台打印如下

C:\Users\Administrator\AppData\Local\Programs\Python\Python35\python.exe D:/py_project/bxj.py
page_list ['https://bbs.hupu.com/selfie-1', 'https://bbs.hupu.com/selfie-2', 'https://bbs.hupu.com/selfie-3', 'https://bbs.hupu.com/selfie-4', 'https://bbs.hupu.com/selfie-5', 'https://bbs.hupu.com/selfie-6', 'https://bbs.hupu.com/selfie-7', 'https://bbs.hupu.com/selfie-8', 'https://bbs.hupu.com/selfie-9', 'https://bbs.hupu.com/selfie-10', 'https://bbs.hupu.com/selfie-11', 'https://bbs.hupu.com/selfie-12', 'https://bbs.hupu.com/selfie-13', 'https://bbs.hupu.com/selfie-14', 'https://bbs.hupu.com/selfie-15', 'https://bbs.hupu.com/selfie-16', 'https://bbs.hupu.com/selfie-17', 'https://bbs.hupu.com/selfie-18', 'https://bbs.hupu.com/selfie-19', 'https://bbs.hupu.com/selfie-20', 'https://bbs.hupu.com/selfie-21', 'https://bbs.hupu.com/selfie-22', 'https://bbs.hupu.com/selfie-23', 'https://bbs.hupu.com/selfie-24', 'https://bbs.hupu.com/selfie-25', 'https://bbs.hupu.com/selfie-26', 'https://bbs.hupu.com/selfie-27', 'https://bbs.hupu.com/selfie-28', 'https://bbs.hupu.com/selfie-29', 'https://bbs.hupu.com/selfie-30', 'https://bbs.hupu.com/selfie-31', 'https://bbs.hupu.com/selfie-32', 'https://bbs.hupu.com/selfie-33', 'https://bbs.hupu.com/selfie-34', 'https://bbs.hupu.com/selfie-35', 'https://bbs.hupu.com/selfie-36', 'https://bbs.hupu.com/selfie-37', 'https://bbs.hupu.com/selfie-38', 'https://bbs.hupu.com/selfie-39', 'https://bbs.hupu.com/selfie-40', 'https://bbs.hupu.com/selfie-41', 'https://bbs.hupu.com/selfie-42', 'https://bbs.hupu.com/selfie-43', 'https://bbs.hupu.com/selfie-44', 'https://bbs.hupu.com/selfie-45', 'https://bbs.hupu.com/selfie-46', 'https://bbs.hupu.com/selfie-47', 'https://bbs.hupu.com/selfie-48', 'https://bbs.hupu.com/selfie-49']
tie_items ['/14954995.html', '/20041029.html', '/20040708.html', '/20040548.html', '/20041549.html', '/20041614.html', '/20039151.html', '/20040984.html', '/20038352.html', '/20013344.html', '/20036573.html', '/20040754.html', '/19890076.html', '/19895152.html', '/20034056.html', '/19990039.html', '/20039679.html', '/20028276.html', '/20032332.html', '/20019927.html', '/20037582.html', '/20038324.html', '/20038982.html', '/20039949.html', '/20035537.html', '/19661088.html', '/20036540.html', '/20040250.html', '/19939316.html', '/20030362.html', '/20039819.html', '/20037245.html', '/20005113.html', '/19680445.html', '/19989175.html', '/20037972.html', '/20036510.html', '/20037832.html', '/20038652.html', '/20038500.html', '/19983549.html', '/19591641.html', '/20040509.html', '/19989165.html', '/20039320.html', '/19994787.html', '/20024524.html', '/20036204.html', '/20040774.html', '/20040196.html', '/19780400.html']
tie_items ['/20039478.html', '/20021185.html', '/20031214.html', '/20030476.html', '/19934730.html', '/20038386.html', '/20032500.html', '/20034757.html', '/20013709.html', '/20038654.html', '/20021044.html', '/20038876.html', '/20027685.html', '/20007956.html', '/20037668.html', '/19988261.html', '/20026195.html', '/19877898.html', '/20040299.html', '/19887587.html', '/20038474.html', '/20037996.html', '/20037314.html', '/20037285.html', '/19986729.html', '/20018093.html', '/19775823.html', '/20039161.html', '/20035851.html', '/20037691.html', '/20033756.html', '/20037113.html', '/20036186.html', '/20032295.html', '/20020954.html', '/20036905.html', '/20037747.html', '/20032639.html', '/20039079.html', '/20037255.html', '/19991391.html', '/20035635.html', '/20033386.html', '/20028516.html', '/20032022.html', '/20031486.html', '/19894097.html', '/20023121.html', '/20032656.html']
tie_items ['/20037248.html', '/20036547.html', '/20037709.html', '/20029517.html', '/19912404.html', '/20034363.html', '/20033257.html', '/20003561.html', '/20021352.html', '/20018753.html', '/19952882.html', '/20025761.html', '/20022075.html', '/20014741.html', '/20030473.html', '/20024464.html', '/20035903.html', '/20019598.html', '/20035949.html', '/20016980.html', '/20033240.html', '/20032903.html', '/20034394.html', '/20026223.html', '/20033465.html', '/20026491.html', '/20024456.html', '/20024780.html', '/20030126.html', '/20031776.html', '/19984788.html', '/20031373.html', '/20022349.html', '/20035153.html', '/20012615.html', '/20032445.html', '/20035245.html', '/20027519.html', '/20030875.html', '/20010224.html', '/20000420.html', '/20024751.html', '/19897907.html', '/20008848.html', '/20025787.html', '/20015131.html', '/19987418.html', '/20032214.html', '/20020226.html']
tie_items ['/20019306.html', '/19794657.html', '/20017983.html', '/20033759.html', '/20029002.html', '/20016788.html', '/20029594.html', '/20032875.html', '/20015485.html', '/20028330.html', '/20032513.html', '/20027060.html', '/20030070.html', '/20019676.html', '/20014723.html', '/20019117.html', '/20010068.html', '/20017120.html', '/20027222.html', '/20013117.html', '/20027070.html', '/20010552.html', '/20008103.html', '/20026039.html', '/20028406.html', '/20023584.html', '/20031056.html', '/20027338.html', '/20029565.html', '/20026380.html', '/20026435.html', '/20007415.html', '/20024140.html', '/20030676.html', '/20024108.html', '/20025124.html', '/19929863.html', '/20029659.html', '/20009688.html', '/20025284.html', '/20028703.html', '/20010925.html', '/20012181.html', '/20019083.html', '/20018264.html', '/20028757.html', '/20029590.html', '/20018127.html', '/18796306.html']
tie_items ['/20013221.html', '/20026440.html', '/20028244.html', '/20021708.html', '/20011593.html', '/19892473.html', '/19639146.html', '/19983383.html', '/20028615.html', '/20006829.html', '/20000164.html', '/20020642.html', '/20002972.html', '/19855236.html', '/20028234.html', '/20028292.html', '/20023320.html', '/20018336.html', '/20007585.html', '/20020595.html', '/19914046.html', '/19996720.html', '/20021937.html', '/20023159.html', '/19985722.html', '/19970030.html', '/20011163.html', '/20006570.html', '/20021198.html', '/20006730.html', '/20018567.html', '/20025354.html', '/20019899.html', '/20022878.html', '/19745940.html', '/20021606.html', '/20020500.html', '/20019109.html', '/19823360.html', '/20021959.html', '/19823966.html', '/19955689.html', '/19987194.html', '/20016748.html', '/20022225.html', '/19972385.html', '/20021579.html', '/20018142.html', '/19987960.html']
tie_items ['/20018710.html', '/20013650.html', '/20017913.html', '/20020838.html', '/20022940.html', '/20015195.html', '/20020686.html', '/20019889.html', '/20021428.html', '/20002371.html', '/20000828.html', '/19880307.html', '/20015372.html', '/20017612.html', '/20020628.html', '/20016948.html', '/20015074.html', '/20010306.html', '/20017131.html', '/20010559.html', '/20002170.html', '/20015404.html', '/20010713.html', '/20020530.html', '/20009959.html', '/20017975.html', '/20016775.html', '/20011400.html', '/20014740.html', '/20019882.html', '/20017430.html', '/19980998.html', '/20017803.html', '/19819136.html', '/19946593.html', '/20004571.html', '/19965972.html', '/19870259.html', '/20016027.html', '/20011153.html', '/20015785.html', '/19998627.html', '/20017282.html', '/20019528.html', '/20011427.html', '/20014198.html', '/20002370.html', '/20014525.html', '/20009340.html']
tie_items ['/19995050.html', '/20001033.html', '/19981903.html', '/19996251.html', '/19986495.html', '/20006241.html', '/19988726.html', '/20004640.html', '/20016953.html', '/20017064.html', '/19991449.html', '/19933376.html', '/19970577.html', '/20009424.html', '/20006799.html', '/20008942.html', '/19897049.html', '/20009583.html', '/19935201.html', '/19979742.html', '/20008525.html', '/19982718.html', '/20002812.html', '/20007605.html', '/19440328.html', '/19979687.html', '/19994730.html', '/20009119.html', '/20001185.html', '/20003970.html', '/20005009.html', '/19687716.html', '/20003412.html', '/20004418.html', '/19988333.html', '/20002417.html', '/20004646.html', '/19977471.html', '/19994171.html', '/20004367.html', '/20010422.html', '/20004995.html', '/20011966.html', '/20007947.html', '/20005448.html', '/19989846.html', '/20007579.html', '/20005234.html', '/20008077.html']
tie_items ['/19973656.html', '/15494409.html', '/20012594.html', '/19984413.html', '/20007441.html', '/20006820.html', '/20010999.html', '/20011611.html', '/19996331.html', '/20010719.html', '/20011655.html', '/20006865.html', '/20005495.html', '/20007296.html', '/20005398.html', '/19691574.html', '/19755441.html', '/20007376.html', '/19970333.html', '/20010469.html', '/19969166.html', '/19980947.html', '/20002146.html', '/19896113.html', '/20004578.html', '/20002415.html', '/20007445.html', '/19983113.html', '/20005185.html', '/20000964.html', '/20004666.html', '/20000439.html', '/20005447.html', '/20004117.html', '/19997640.html', '/19995463.html', '/19815623.html', '/19249566.html', '/19984861.html', '/20001760.html', '/19995330.html', '/20004249.html', '/20004289.html', '/19987075.html', '/20006715.html', '/20001580.html', '/19998934.html', '/20005021.html']
tie_items ['/19983066.html', '/19990648.html', '/19987888.html', '/20001128.html', '/20005420.html', '/20003626.html', '/19993504.html', '/20004874.html', '/20000069.html', '/20004253.html', '/19995712.html', '/20000356.html', '/20004931.html', '/19992067.html', '/19985280.html', '/19994663.html', '/20002991.html', '/20003358.html', '/20004327.html', '/19991763.html', '/19988361.html', '/19878955.html', '/19985946.html', '/19981462.html', '/19991034.html', '/19999882.html', '/19971217.html', '/19995504.html', '/19996935.html', '/20004656.html', '/19947691.html', '/19992949.html', '/19996478.html', '/20002378.html', '/19989935.html', '/19994425.html', '/19991735.html', '/20001783.html', '/19995486.html', '/20001986.html', '/20001443.html', '/19988022.html', '/19995118.html', '/19984147.html', '/20000870.html', '/19962931.html', '/20001834.html', '/19992511.html', '/19972861.html']
tie_items ['/19993678.html', '/19928036.html', '/19956432.html', '/19966175.html', '/19975168.html', '/19997871.html', '/18344339.html', '/19930773.html', '/19996438.html', '/19965160.html', '/19999635.html', '/19990734.html', '/19994416.html', '/19998316.html', '/19372885.html', '/19995289.html', '/19997159.html', '/19997516.html', '/19752914.html', '/19990496.html', '/19980916.html', '/19993162.html', '/20000538.html', '/20000235.html', '/19995570.html', '/19997563.html', '/19903347.html', '/19989350.html', '/19987741.html', '/19991631.html', '/19998699.html', '/19992688.html', '/19975374.html', '/19993898.html', '/19996918.html', '/19994060.html'
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值