# -*- coding: utf-8 -*-
"""Crawl the 58.com 'ershoufang' listing pages, city by city.

The script opens the city-selection page, collects every city link found in
the ``#clist`` element, and then scrapes each city's listing pages in a small
thread pool (one Chrome instance per city).  Every scraped listing is appended
as one text line to ``<savePath>\\<cityName>.txt``.

NOTE: written for Python 2 and the Selenium 3 ``find_element_by_*`` API.
``print(...)`` is always called with a single argument so that it behaves
identically as a Python 2 print statement.
"""
import io
import os
import sys
import threading
import time
import traceback

import threadpool
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8.
    # ``reload`` restores ``sys.setdefaultencoding``, which site.py removes.
    reload(sys)  # noqa: F821 (builtin on Python 2)
    sys.setdefaultencoding('utf-8')

webdriverPath = 'D:\\test\\chromedriver.exe'
savePath = 'D:\\test'

cityIndexUrl = 'http://www.58.com/ershoufang/changecity/'
listingSelector = (
    'body > div.main-wrap > div.content-wrap > div.content-side-left > ul')
pagerLinkSelector = (
    'body > div.main-wrap > div.content-wrap > div.content-side-left > '
    'div.pager > a:nth-child(4)')


class GetName(object):
    """Scraper that walks every city and stores its listings on disk."""

    def getName(self):
        """Collect all city links and crawl each one in a 3-worker pool.

        Errors raised while collecting the links or while driving the pool
        are reported on stdout/stderr and swallowed (best effort); failure
        to start Chrome propagates to the caller.
        """
        driver = webdriver.Chrome(webdriverPath)
        try:
            try:
                allCityUrlList = self._collectCityLinks(driver)
            finally:
                # quit() (not close()) ends the session and the chromedriver
                # process; the index browser is not needed during the crawl.
                driver.quit()
            print(len(allCityUrlList))

            pool = threadpool.ThreadPool(3)
            for request in threadpool.makeRequests(self.crawler,
                                                   allCityUrlList):
                pool.putRequest(request)
            pool.wait()
        except Exception:
            print("crawler name error !")
            traceback.print_exc()

    def _collectCityLinks(self, driver):
        """Return ``[{"cityName": ..., "cityUrl": ...}, ...]`` from the index."""
        driver.get(cityIndexUrl)
        time.sleep(2)  # fixed pause after navigation, as in the original
        cityPage = driver.find_element_by_css_selector('#clist')
        cityGroups = cityPage.find_elements_by_tag_name('dd')
        print(len(cityGroups))

        cityLinks = []
        for group in cityGroups:
            for anchor in group.find_elements_by_tag_name('a'):
                url = anchor.get_attribute('href')
                cityEntry = {"cityName": anchor.text, "cityUrl": url}
                print(cityEntry)
                cityLinks.append(cityEntry)
        return cityLinks

    def getUrlPage(self, driver, allCityUrl):
        """Scrape the listing page currently loaded in ``driver``.

        Each ``li`` of the listing ``ul`` is turned into one ``'- '``-joined
        line, printed, and appended to the city's file via ``writeFile``.

        :param driver: Selenium driver already positioned on a listing page.
        :param allCityUrl: dict with at least the key ``"cityName"``.
        :returns: list of the formatted lines, in page order.
        :raises: Selenium lookup errors or ``IndexError`` when an item does
            not have the expected structure; items already processed have
            been written to disk by then.
        """
        page = driver.find_element_by_css_selector(listingSelector)
        infoList = page.find_elements_by_css_selector('li')
        cityName = allCityUrl.get('cityName')

        resultList = []
        for info in infoList:
            addressNameInfo = info.find_element_by_class_name('list-info').text
            priceInfo = info.find_element_by_class_name('price').text
            addressNameTemp = addressNameInfo.split("\n")
            priceTemp = priceInfo.split("\n")
            # NOTE(review): the positional indexes below depend on the exact
            # text layout of the 'list-info' / 'price' elements - confirm
            # against the live page before changing them.
            addressTemp = addressNameTemp[2].split("-")
            name = addressNameTemp[3].split("- ")[1]
            result = '- '.join([
                name,
                cityName,
                addressTemp[1],
                addressTemp[0],
                addressTemp[2],
                addressNameTemp[1],
                priceTemp[0],
                priceTemp[1],
            ])
            resultList.append(result)
            print(result)
            self.writeFile(result, cityName)
        return resultList

    def writeFile(self, content, cityName):
        """Append ``content`` plus a newline to ``<savePath>/<cityName>.txt``.

        The file is written as UTF-8 and is closed even if the write fails.
        """
        path = os.path.join(savePath, cityName + '.txt')
        with io.open(path, 'a', encoding='utf-8') as fileObject:
            # ``+ u'\n'`` also promotes a Python 2 byte string to unicode,
            # which io.open requires.
            fileObject.write(content + u'\n')

    def crawler(self, allCityUrl):
        """Scrape all listing pages of one city with a dedicated browser.

        :param allCityUrl: dict with the keys ``"cityName"`` and ``"cityUrl"``.

        Pages ``1 .. pageCount - 1`` are visited, where ``pageCount`` is the
        number shown by the pager link.  When no usable pager link exists the
        page that is already loaded is scraped once.  All errors after the
        browser has started are reported and swallowed so that one city
        cannot abort the pool.
        """
        driver = webdriver.Chrome(webdriverPath)
        try:
            driver.get(allCityUrl.get("cityUrl"))
            pageCount, pagerUrl = self._readPager(driver)
            if pageCount is None:
                # No pager: treat the city as having just the loaded page.
                print(len(self.getUrlPage(driver, allCityUrl)))
                return

            print(pagerUrl)
            baseUrl = pagerUrl.split("/pn")[0]
            # NOTE(review): range() stops before pageCount, exactly as the
            # original did - confirm whether the last page should be included.
            for i in range(1, pageCount):
                try:
                    url = baseUrl + "/pn" + str(i) + "/"
                    print(url)
                    driver.get(url)
                    time.sleep(1)  # fixed pause after navigation
                    urlList = self.getUrlPage(driver, allCityUrl)
                    print(len(urlList))
                except Exception:
                    # Keep going with the next page on any per-page failure.
                    print("get pageInfo error ! ")
                    traceback.print_exc()
        except Exception:
            print("one pool error !")
            traceback.print_exc()
        finally:
            # quit() shuts down the chromedriver process; close() would only
            # close the window and leak one process per city.
            driver.quit()

    def _readPager(self, driver):
        """Return ``(pageCount, pagerUrl)`` or ``(None, None)`` if unavailable."""
        try:
            pagerLink = driver.find_element_by_css_selector(pagerLinkSelector)
            pageCount = int(pagerLink.text)
            pagerUrl = pagerLink.get_attribute('href')
        except (WebDriverException, ValueError):
            print("get pageNum error ! ")
            traceback.print_exc()
            return None, None
        if not pagerUrl:
            return None, None
        return pageCount, pagerUrl


if __name__ == "__main__":
    test = GetName()
    test.getName()
爬虫练习
最新推荐文章于 2022-09-25 20:58:22 发布