I have been learning Scrapy these days and finally wrote something with it:
a spider that scrapes course info from imooc.com. (In fact, I first tried to scrape some other sites, but found that many of their pages are rendered with JavaScript.)
It's not a complete experiment yet, though; I still haven't finished the scheduler...
Here is the code:
Imooc.py
import codecs

import scrapy
from bs4 import BeautifulSoup

from Imooc.items import ImoocItem


def PrintSoup(soup):
    # Debug helper: dump the prettified HTML to a file.
    file = codecs.open('soup.txt', 'w+', 'utf-8')
    file.write(soup.prettify())
    file.close()


class Imooc(scrapy.Spider):
    # ---Spider info---
    name = 'Imooc'
    allowed_domains = ['www.imooc.com']  # domain only; a scheme here breaks the offsite filter
    start_urls = ['http://www.imooc.com/course/list']

    # ---parsing part---
    def parse(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        # PrintSoup(soup)
        tot = 0
        for div in soup.find_all(attrs={'class': 'course-card-container'}):
            item = ImoocItem()
            # Course category labels, joined with single spaces.
            labels = div.find(attrs={'class': 'course-label'}).find_all('label')
            item['index'] = ' '.join(label.string for label in labels)
            # Course title.
            item['name'] = div.find('h3').string
            # The second <span> holds the number of enrolled students.
            item['number'] = div.find_all('span')[1].get_text()
            tot += 1
            yield item
        print(tot)

# This works perfectly with scrapy
# Finally made something out...
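The spider imports ImoocItem from Imooc/items.py, which isn't shown here. A minimal sketch of it, assuming nothing beyond the three fields that parse() fills in, would be:

items.py (assumed)

import scrapy


class ImoocItem(scrapy.Item):
    # The three fields filled in by Imooc.parse().
    index = scrapy.Field()   # course category labels
    name = scrapy.Field()    # course title
    number = scrapy.Field()  # number of enrolled students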
settings.py
ITEM_PIPELINES = {
    # The value (0-1000) is the pipeline's order; lower values run first.
    'Imooc.pipelines.ImoocPipeline': 1,
}
pipelines.py
import json
import codecs

from pymongo import MongoClient


class ImoocPipeline(object):
    def __init__(self):
        self.file = codecs.open('data.json', 'w', encoding='utf-8')
        # Collection "Mooc" in database "Mooc" on the local MongoDB instance.
        self.col = MongoClient('localhost', 27017).Mooc.Mooc

    def process_item(self, item, spider):
        # Caution: remember to convert the item with dict()
        # Caution: pass ensure_ascii=False to json.dumps, otherwise
        # non-ASCII text gets escaped in data.json
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        self.col.insert_one(dict(item))  # insert() is deprecated in pymongo 3+
        return item
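Once the spider has run, the stored documents can be checked with a few lines of pymongo. This is just a quick sanity-check sketch, assuming the same localhost connection and Mooc.Mooc collection as in the pipeline above:

from pymongo import MongoClient

col = MongoClient('localhost', 27017).Mooc.Mooc
print(col.count_documents({}))   # how many courses were stored
for doc in col.find().limit(3):  # peek at the first few documents
    print(doc['index'], doc['name'], doc['number'])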
The results in MongoDB: