Scraping Sogou Images

This post describes how to scrape images, question answers, and video links from Sogou search with simple Python crawlers (with parallel examples for Baidu Zhidao and 360 video). For each task (fetching images, answers, or videos) it gives a concrete function implementation and shows in detail how to parse the page source to extract the data.

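The functions below are Python 2 (print statements, urllib/urllib2) and need the following imports to run. getImages() also calls saveImage and decodeHtml, which the original post never shows; the two helpers here are my own hypothetical stand-ins: decodeHtml undoes the backslash-escaped slashes in Sogou's embedded URLs, and saveImage writes one image to disk.

# -*- coding: utf-8 -*-
import re
import urllib
import urllib2
from urllib import quote

def decodeHtml(url):
    # Hypothetical helper (not in the original post): Sogou embeds image
    # URLs with escaped slashes, e.g. "http:\/\/...", so undo that.
    return url.replace("\\/", "/")

def saveImage(index, url):
    # Hypothetical helper (not in the original post): download one image
    # and save it under a numeric filename.
    data = urllib.urlopen(url).read()
    with open("%d.jpg" % index, "wb") as f:
        f.write(data)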

def getImages(keyword):
    # Query Sogou image search; the keyword must be URL-encoded UTF-8.
    print quote(keyword)
    url = "http://pic.sogou.com/pics?ie=utf8&p=40230504&query=" + quote(keyword) + "&"
    webcontent = urllib.urlopen(url).read()
    # The result data is embedded in the page as a JavaScript variable
    # (imgTempData); hintWordArr holds the related-search suggestions.
    content = re.findall(r'imgTempData=([\w\W]+?)"};', webcontent)[0]
    hintWord = re.findall(r'hintWordArr = ([\w\W]+?);', webcontent)[0]
    # Pull the parallel field lists out of the embedded blob.
    title = re.findall(r'"title":"(.*?)"', content)
    thumbUrl = re.findall(r'"thumbUrl":"(.*?)"', content)
    pic_url = re.findall(r'"pic_url":"(.*?)"', content)

    # Sanity check: the three lists should be the same length.
    print len(title), len(thumbUrl), len(pic_url)
    print eval(hintWord)

    for i in range(len(title)):
        print "save the %s image" % str(i + 1)
        saveImage(i + 1, decodeHtml(pic_url[i]))
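print eval(hintWord) executes the hintWordArr JavaScript literal as Python, which works only by coincidence and runs arbitrary code from the page. If the literal is also valid JSON (an assumption; the post does not show the raw markup), json.loads is a safer parse:

import json

def parseHintWords(webcontent):
    # Safer alternative to eval() for the related-search suggestions,
    # assuming the hintWordArr literal is valid JSON (double-quoted
    # strings, no trailing comma) -- an assumption, since the raw
    # markup is not shown above.
    hintWord = re.findall(r'hintWordArr = ([\w\W]+?);', webcontent)[0]
    return json.loads(hintWord)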

def getAnswer(keyword):
    # Search Sogou Wenwen (Q&A); the extra query parameters come from
    # the site's own search form.
    print quote(keyword)
    url = "http://wenwen.sogou.com/s/?w=" + quote(keyword) + "&search=%E6%90%9C%E7%B4%A2%E7%AD%94%E6%A1%88&ch=sp.sb.top"
#    webcontent = urllib.urlopen(url).read()
    request = urllib2.Request(url)
#    request.add_header("Accept", "image/jpeg, application/x-ms-application, image/gif, application/xaml+xml, image/pjpeg, application/x-ms-xbap, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*")
#    request.add_header("Proxy-Connection", "Keep-Alive")
#    request.add_header("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; InfoPath.3)")
    # The Cookie below was captured from a logged-in browser session and
    # will have expired; substitute a fresh one. Wenwen also checks the
    # Referer header.
    request.add_header("Cookie", "ww_sTitle=%u6885%u897F*%u5E7F%u573A%u821E%u5927%u5988%u66B4%u8D70*%u4E16%u754C%u676F; ww_filter=1; ww_search_tips=nulln; token=9B014E77FBB8EA4878606E4D47681803423EAFB9; CXID=0AA722D9B92043B4F3790F68E8ECD1D5; SUID=7326E6A761110C0A5322755500039D50; ad=0yllllllll2Fs2dzlllllVn@FYllllllIMYutZllll9llllljCxlw@@@@@@@@@@@; ssuid=4424273571; pgv_pvi=8271389696; IPLOC=US; SUV=1404872814721204395706916; suid=72407294724679934; sct=19; ld=Xkllllllll2Fsc4jlllllVn@kt6lllllIMYutZllllwllllljZlll5@@@@@@@@@@; browser_width=1259; browser_height=867; GOTO=; cid=websearch2ww; ss_cidf=1; LSTMV=278%2C62; LCLKINT=1467; MAIN_SESSIONID=n11vil8v2or4r9sdl4wvgc9xz.n11; token=9B014E77FBB8EA4878606E4D47681803423EAFB9")
    request.add_header("Referer", "http://wenwen.sogou.com")
    opener = urllib2.build_opener()
    webcontent = opener.open(request).read()
    # Question titles and answer summaries from the result list.
    titles = re.findall(r'level="\d+"> <span>([\w\W]+?)<span>', webcontent)
    summarys = re.findall(r'<div class="summary">([\w\W]+?)</div>', webcontent)
    print len(titles), len(summarys)
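The summary regex captures raw inner HTML, tags included. A minimal cleanup sketch before printing or storing the text:

def stripTags(html):
    # Drop tags, then collapse the whitespace runs they leave behind.
    text = re.sub(r'<[^>]+?>', '', html)
    return re.sub(r'\s+', ' ', text).strip()

For example, print stripTags(summarys[0]) after the findall calls shows the first answer snippet as plain text.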

def getBaiduAnswer(keyword):
    # Baidu Zhidao expects the query in GBK, so re-encode the UTF-8
    # keyword to cp936 before URL-encoding it.
    print quote(keyword.decode("utf8").encode("cp936"))
    url = "http://zhidao.baidu.com/search?word=" + quote(keyword.decode("utf8").encode("cp936"))
    webcontent = urllib.urlopen(url).read()
    # Question titles and answer snippets from the result list.
    title = re.findall(r'<dt class="dt[\w\W]+?</dt>', webcontent)
    answer = re.findall(r'<dd class="dd answer">([\w\W]+?)</dd>', webcontent)
    print len(title), len(answer)
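The cp936 round-trip exists because Baidu Zhidao expects GBK query strings; the result page is presumably GBK as well, so matched snippets need decoding before display. A sketch under that assumption:

def printGbkSnippets(snippets):
    # Assuming the page is GBK-encoded (as the cp936 query encoding
    # suggests), decode before printing so a UTF-8 console shows
    # readable text.
    for s in snippets:
        print s.decode("gbk", "ignore").encode("utf8")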

def getSogouVedios(keyword):
    # Query Sogou video search.
    print quote(keyword)
    url = "http://v.sogou.com/v?ie=utf8&p=40230608&query=" + quote(keyword) + "&"
    webcontent = urllib.urlopen(url).read()
    # Thumbnails are loaded through a JS call; titles and tags sit in
    # HTML comments; source site and duration are in the overlay bar.
    thumbs = re.findall(r"\('normal', '([\w\W]+?)'", webcontent)
    links = re.findall(r"link: ([\w\W]+?)<br>", webcontent)
    titles = re.findall(r"<!--resultTitle:([\w\W]+?)-->", webcontent)
    tags = re.findall(r"<!--tag:([\w\W]+?)-->", webcontent)
    sources = re.findall(r'<span class="shade_bar_lft">([\w\W]+?)</span>', webcontent)
    lengths = re.findall(r'<span class="shade_bar_rgt">([\w\W]+?)</span>', webcontent)
    print len(thumbs)
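The six regexes scrape independent parts of the page, so the lists can come back with different lengths. A defensive way to assemble them into records, keeping only fully populated rows:

def buildVideoRecords(titles, links, thumbs, sources, lengths):
    # min() guards against mismatched list lengths from the independent
    # regexes; one missing overlay span would otherwise shift every field.
    n = min(len(titles), len(links), len(thumbs), len(sources), len(lengths))
    return [{"title": titles[i], "link": links[i], "thumb": thumbs[i],
             "source": sources[i], "length": lengths[i]}
            for i in range(n)]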
    
def get360Vedios(keyword):
    # Query 360 video search and narrow the page to the short-video
    # block before scraping, so the regexes don't match the sidebar.
    url = "http://video.so.com/v?q=" + quote(keyword)
    webcontent = urllib.urlopen(url).read()
    webcontent = re.findall(r"<div class='b-shortvideo'>([\w\W]+?)<div class='p-side'>", webcontent)[0]
    thumbs = re.findall(r"data-src='(.*?)'", webcontent)  # lazy-loaded thumbnails
    sources = re.findall(r'<span class="w-figure-lefthint">([\w\W]+?)</span>', webcontent)   # source site
    lengths = re.findall(r'<span class="w-figure-righthint">([\w\W]+?)</span>', webcontent)  # duration
    links = re.findall(r"<a class='w-figure-title' href='([\w\W]+?)'", webcontent)
    titles = re.findall(r"<h4>(.*?)</h4>", webcontent)
    print len(thumbs), len(sources), len(lengths), len(links), len(titles)
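A minimal driver, assuming the file is saved as UTF-8 and run with Python 2:

if __name__ == "__main__":
    keyword = "世界杯"  # any UTF-8 query string
    getImages(keyword)
    getBaiduAnswer(keyword)
    getSogouVedios(keyword)
    get360Vedios(keyword)
    # getAnswer(keyword) additionally needs a fresh wenwen.sogou.com Cookie.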
