def getImages(keyword):
    """Search Sogou Pictures for *keyword* and save every picture found.

    The result page is fetched from pic.sogou.com, the ``imgTempData``
    blob and the ``hintWordArr`` value are cut out of it with regular
    expressions, a few diagnostics are printed, and each extracted
    ``pic_url`` is passed through ``decodeHtml`` and handed to
    ``saveImage`` together with its 1-based position.

    Args:
        keyword: Search term; it is URL-escaped with ``quote`` before
            being placed in the query string.

    Raises:
        IndexError: If the page contains no ``imgTempData`` or no
            ``hintWordArr`` section (the first regex match is required).
    """
    quoted = quote(keyword)
    print(quoted)
    url = "http://pic.sogou.com/pics?ie=utf8&p=40230504&query=" + quoted + "&"

    # Close the connection explicitly instead of leaving it to the
    # garbage collector.
    response = urllib.urlopen(url)
    try:
        webcontent = response.read()
    finally:
        response.close()

    content = re.findall(r'imgTempData=([\w\W]+?)"};', webcontent)[0]
    hintWord = re.findall(r'hintWordArr = ([\w\W]+?);', webcontent)[0]

    # title and thumbUrl are only used for the diagnostic counts below.
    title = re.findall('"title":"(.*?)"', content)
    thumbUrl = re.findall('"thumbUrl":"(.*?)"', content)
    pic_url = re.findall('"pic_url":"(.*?)"', content)
    print("%d %d %d" % (len(title), len(thumbUrl), len(pic_url)))

    # SECURITY: eval() executes text taken verbatim from a remote page.
    # Kept to preserve the printed output; review before running this
    # against a source that is not trusted.
    print(eval(hintWord))

    # Walk the picture URLs themselves. Driving the loop from len(title)
    # raised IndexError when fewer URLs than titles were extracted and
    # skipped pictures when there were more.
    for number, raw_url in enumerate(pic_url, 1):
        print("save the %s image" % str(number))
        saveImage(number, decodeHtml(raw_url))
def getAnswer(keyword):
    """Query Sogou Wenwen for *keyword* and print how many hits were parsed.

    The search page is requested with a fixed ``Cookie`` header and a
    ``Referer`` header. Result titles and summaries are extracted with
    regular expressions; only their counts are printed and nothing is
    returned.

    Args:
        keyword: Search term; it is URL-escaped with ``quote`` before
            being placed in the query string.
    """
    escaped = quote(keyword)
    print(escaped)

    base = "http://wenwen.sogou.com/s/?w="
    suffix = "&search=%E6%90%9C%E7%B4%A2%E7%AD%94%E6%A1%88&ch=sp.sb.top"
    request = urllib2.Request(base + escaped + suffix)

    # Hard-coded session cookie sent with every request.
    cookie = "ww_sTitle=%u6885%u897F*%u5E7F%u573A%u821E%u5927%u5988%u66B4%u8D70*%u4E16%u754C%u676F; ww_filter=1; ww_search_tips=nulln; token=9B014E77FBB8EA4878606E4D47681803423EAFB9; CXID=0AA722D9B92043B4F3790F68E8ECD1D5; SUID=7326E6A761110C0A5322755500039D50; ad=0yllllllll2Fs2dzlllllVn@FYllllllIMYutZllll9llllljCxlw@@@@@@@@@@@; ssuid=4424273571; pgv_pvi=8271389696; IPLOC=US; SUV=1404872814721204395706916; suid=72407294724679934; sct=19; ld=Xkllllllll2Fsc4jlllllVn@kt6lllllIMYutZllllwllllljZlll5@@@@@@@@@@; browser_width=1259; browser_height=867; GOTO=; cid=websearch2ww; ss_cidf=1; LSTMV=278%2C62; LCLKINT=1467; MAIN_SESSIONID=n11vil8v2or4r9sdl4wvgc9xz.n11; token=9B014E77FBB8EA4878606E4D47681803423EAFB9"
    request.add_header("Cookie", cookie)
    request.add_header("Referer", "http://wenwen.sogou.com")

    page = urllib2.build_opener().open(request).read()

    found_titles = re.findall('level="\d+"> <span>([\w\W]+?)<span>', page)
    found_summaries = re.findall('<div class="summary">([\w\W]+?)</div>', page)
    print("%d %d" % (len(found_titles), len(found_summaries)))
def getBaiduAnswer(keyword):
    """Query Baidu Zhidao for *keyword* and print how many hits were parsed.

    The keyword is decoded as UTF-8, re-encoded as cp936 and URL-escaped
    before being placed in the search URL. Question blocks and answer
    blocks are extracted with regular expressions; only their counts are
    printed and nothing is returned.

    Args:
        keyword: Search term as a UTF-8 encoded byte string.
    """
    # Re-encode once and reuse the escaped form for both the log line
    # and the URL.
    escaped = quote(keyword.decode("utf8").encode("cp936"))
    print(escaped)

    page = urllib.urlopen("http://zhidao.baidu.com/search?word=" + escaped).read()

    question_blocks = re.findall('<dt class="dt[\w\W]+?</dt>', page)
    answer_blocks = re.findall('<dd class="dd answer">([\w\W]+?)</dd>', page)
    print("%d %d" % (len(question_blocks), len(answer_blocks)))
def getSogouVedios(keyword):
    """Query Sogou Video for *keyword* and print the number of thumbnails.

    Six kinds of fields are extracted from the result page with regular
    expressions (thumbnails, links, titles, tags, sources, lengths), but
    only the thumbnail count is printed; the other lists are currently
    unused and nothing is returned.

    Args:
        keyword: Search term; it is URL-escaped with ``quote`` before
            being placed in the query string.
    """
    escaped = quote(keyword)
    print(escaped)

    page = urllib.urlopen("http://v.sogou.com/v?ie=utf8&p=40230608&query=" + escaped + "&").read()

    # One pattern per field, in the order they are unpacked below.
    field_patterns = (
        "\(\'normal\', \'([\w\W]+?)\'",
        "link: ([\w\W]+?)<br>",
        "<!--resultTitle:([\w\W]+?)-->",
        "<!--tag:([\w\W]+?)-->",
        '<span class="shade_bar_lft">([\w\W]+?)</span>',
        '<span class="shade_bar_rgt">([\w\W]+?)</span>',
    )
    thumbs, links, titles, tags, sources, lengths = [
        re.findall(pattern, page) for pattern in field_patterns
    ]

    print(len(thumbs))
def get360Vedios(keyword):
    """Query 360 Video for *keyword* and print per-field match counts.

    Only the part of the page between the ``b-shortvideo`` container and
    the ``p-side`` container is parsed. Thumbnails, sources, lengths,
    links and titles are extracted from it with regular expressions and
    their counts are printed on one line; nothing is returned.

    Args:
        keyword: Search term; it is URL-escaped with ``quote`` before
            being placed in the query string.

    Raises:
        IndexError: If the page has no ``b-shortvideo`` section.
    """
    page = urllib.urlopen("http://video.so.com/v?q=" + quote(keyword)).read()

    # Restrict parsing to the short-video section of the page.
    section = re.findall("<div class='b-shortvideo'>([\w\W]+?)<div class='p-side'>", page)[0]

    thumbs = re.findall("data-src='(.*?)'", section)
    sources = re.findall('<span class="w-figure-lefthint">([\w\W]+?)</span>', section)
    lengths = re.findall('<span class="w-figure-righthint">([\w\W]+?)</span>', section)
    links = re.findall("<a class='w-figure-title' href='([\w\W]+?)'", section)
    titles = re.findall("<h4>(.*?)</h4>", section)

    counts = [len(found) for found in (thumbs, sources, lengths, links, titles)]
    print(" ".join(str(count) for count in counts))