爬下厨房网页,并且自带翻页功能~~
还能把内容录入到文件里面喔~
废话不多说,上代码:
# Get information (recipes and the chef's name included) about some delicious dishes
# and save that information to a file.
from lxml import html
from time import sleep
import os
# Scrape the xiachufang.com "explore" listing, build a plain-text report and
# save it to `filename`.
#
# The report is collected as a list of lines and written exactly once, after
# all scraping has finished, so a failed request cannot leave behind an empty
# or half-written output file.

# The file is written in binary mode, so the platform line separator has to be
# inserted explicitly.
ls = os.linesep
filename = "OutputFile.txt"
if os.path.exists(filename):
    # Warning only: the run continues and the file is replaced at the end.
    print("ERROR,'%s' already exists!Please name the file again~" % filename)

lines = ["get the information of some delicious dishes"]

start_url = 'http://www.xiachufang.com/explore'
headline_xpath = "//ul[@class='list']/li/div/div/p[@class='name']/a/text()"
next_button_xpath = "//a[@class='next']/@href"

# --- First page: details of the first few dishes ---------------------------
x = html.parse(start_url)
titles = x.xpath(headline_xpath)
cook = x.xpath("//ul[@class='list']/li/div/div/p[@class='author']/a/text()")
status = x.xpath("//ul[@class='list']/li/div/div/p[@class='stats green-font']/span/text()")
material = x.xpath("//ul[@class='list']/li/div/div/p[@class='ing ellipsis']/text()")
lines.append("We got %s titles with its chief name and status. Here are the top 5:" % len(titles))

top_n = 5
# zip() stops at the shortest list, so a list that came back shorter than
# `titles` cannot raise IndexError the way positional indexing would.
# NOTE(review): the four lists are paired purely by position; if a dish lacks
# one of the fields, later entries shift - confirm against the page markup.
for title, chef, cooked, ingredients in zip(titles[:top_n], cook[:top_n],
                                            status[:top_n], material[:top_n]):
    lines.append(" >" + title)
    lines.append(" >>chief:" + chef)
    lines.append(" >>>has been cooked:" + cooked + "times")
    lines.append(" >>>>material:" + ingredients)
    lines.append("**********************************************************")

# --- Following the "next" link ---------------------------------------------
# Keep collecting titles page by page until `threshold` titles have been
# gathered or a page has no "next" link.
lines.append("")
lines.append("")
lines.append("*********************function: searching next pages***********************")
newTitles = []
base_url = 'http://www.xiachufang.com/{}'
next_page = start_url
threshold = 50
while len(newTitles) < threshold and next_page:
    x = html.parse(next_page)
    headlines = x.xpath(headline_xpath)
    lines.append("Retrieved {} titles from url: {}".format(len(headlines), next_page))
    newTitles += headlines
    next_pages = x.xpath(next_button_xpath)
    if next_pages:
        # base_url already ends with '/'; strip a leading '/' from the href so
        # the joined URL does not contain '//' in its path.
        # NOTE(review): presumably the href is site-relative - an absolute URL
        # here would still produce a broken address.
        next_page = base_url.format(next_pages[0].lstrip('/'))
    else:
        lines.append("No next button found")
        next_page = None
    # Wait three seconds before the next loop iteration.
    sleep(3)

if len(newTitles) >= threshold:
    lines.append("the number of titles is:%s,enough information!" % len(newTitles))

# --- Write the report -------------------------------------------------------
# Every report line, including the last one, is terminated by `ls`.
writeline = ls.join(lines) + ls
with open(filename, 'wb') as out:
    out.write(writeline.encode('utf-8'))
print('Done!Tada!!')