from selenium import webdriver
from lxml import etree
import pymongo
# Module-level MongoDB handles shared by the functions below.
conn = pymongo.MongoClient('localhost', 27017)
db = conn.wangyi  # database: "wangyi"
table = db.wang   # collection: "wang" — written to by save_data()
def save_data(dic):
    """Insert one document *dic* into the module-level ``wang`` collection.

    :param dic: a dict-like document, e.g. ``{'title': ...}``
    """
    table.insert_one(dic)
def save_wangyi(url):
    """Render *url* in a real browser, scrape news headlines, store them in MongoDB.

    The page is JavaScript-rendered, hence Selenium instead of a plain HTTP
    request. All headline texts are joined into a single string and saved as
    one document via :func:`save_data`.

    :param url: address of the NetEase news page to scrape
    """
    browser = webdriver.Chrome('./chromedriver.exe')
    try:
        browser.get(url=url)
        html = browser.page_source
    finally:
        # Always release the Chrome/driver processes, even if the page
        # load raises — the original code leaked them on every run.
        browser.quit()
    tree = etree.HTML(html)
    # Collect the text of every headline anchor in the news list.
    title = ''.join(tree.xpath('//ul[@class="newsdata_list fixed_bar_padding noloading"]/li//h3/a/text()'))
    dic = {'title': title}
    save_data(dic)
if __name__ == '__main__':
    # Scrape the NetEase domestic-news landing page.
    save_wangyi('https://news.163.com/domestic/')
import scrapy
from selenium import webdriver
from ..items import WyxwItem
class WySpider(scrapy.Spider):
    """Spider for the NetEase domestic-news page.

    A Selenium Chrome instance is exposed as a class attribute; it is
    presumably consumed by a downloader middleware to render the
    JS-driven page before ``parse`` sees the response — confirm against
    the project's middlewares module.
    """
    name = 'wy'
    start_urls = ['https://news.163.com/domestic']
    # NOTE(review): attribute name keeps the original spelling ("broswer")
    # because external code (a middleware) may reference spider.broswer.
    # The driver is created at import time, when the class body executes.
    broswer = webdriver.Chrome('./chromedriver.exe')

    def parse(self, response):
        """Yield one WyxwItem per entry in the main news list."""
        for div in response.xpath('//div[@class="ndi_main"]/div'):
            item = WyxwItem()
            item['title'] = div.xpath('.//h3/a/text()').extract_first()
            yield item

    def closed(self, reason):
        # Scrapy calls this hook when the spider finishes; quit Chrome
        # here so the driver process does not leak after every crawl.
        self.broswer.quit()
import pymongo
class WyxwPipeline(object):
    """Item pipeline that writes every scraped item into MongoDB."""

    # Host/port made explicit for consistency with the companion script;
    # these are pymongo's defaults, so behavior is unchanged.
    conn = pymongo.MongoClient('localhost', 27017)
    db = conn.wynews    # database: "wynews"
    table = db.news     # collection: "news"

    def process_item(self, item, spider):
        """Insert *item* as a plain dict and pass it on to later pipelines."""
        self.table.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Scrapy calls this when the crawl ends; close the client so the
        # connection pool is released (the original never closed it).
        self.conn.close()
# NOTE: Before running, uncomment the downloader middleware and the item
# pipeline entries in the project's settings.py.