# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class MaoyanItem(scrapy.Item):
    """Container for one movie record scraped from the Maoyan board."""
    #********** Begin **********#
    name = scrapy.Field()         # movie title
    starts = scrapy.Field()       # starring actors (field name kept for pipeline compatibility)
    releasetime = scrapy.Field()  # release-date string
    score = scrapy.Field()        # rating score
    #********** End **********#
- `step1/maoyan/maoyan/pipelines.py`
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from maoyan import settings


class MaoyanPipeline(object):
    """Persist scraped Maoyan movie items into the local MySQL database."""

    def process_item(self, item, spider):
        #********** Begin **********#
        # 1. Connect to the database.
        connection = pymysql.connect(
            host='localhost',   # local database server
            port=3306,          # database port
            user='root',        # MySQL user name
            passwd='123123',    # MySQL password
            db='mydb',          # database name
            charset='utf8',     # default character encoding
        )
        # 2. Create the table if needed, insert the row, close the
        #    connection, and return the item.
        name = item['name']
        starts = item['starts']
        releasetime = item['releasetime']
        score = item['score']
        try:
            with connection.cursor() as cursor:
                sql1 = ('Create Table If Not Exists mymovies('
                        'name varchar(50) CHARACTER SET utf8 NOT NULL,'
                        'starts text CHARACTER SET utf8 NOT NULL,'
                        'releasetime varchar(50) CHARACTER SET utf8 DEFAULT NULL,'
                        'score varchar(20) CHARACTER SET utf8 NOT NULL,'
                        'PRIMARY KEY(name))')
                # Parameterized insert: the driver escapes the values, which
                # prevents SQL injection and avoids breakage when scraped
                # text contains quotes (the original built the statement
                # with %-string interpolation).
                sql2 = 'Insert into mymovies values (%s, %s, %s, %s)'
                cursor.execute(sql1)
                cursor.execute(sql2, (name, starts, releasetime, score))
            # Commit this insert.
            connection.commit()
        finally:
            # Always release the connection, even if the insert fails.
            connection.close()
        return item
        #********** End **********#
- `step1/maoyan/maoyan/spiders/movies.py`
# -*- coding: utf-8 -*-
import scrapy
from maoyan.items import MaoyanItem


class MoviesSpider(scrapy.Spider):
    """Crawl the Maoyan top-100 board pages and yield one item per movie."""
    name = 'movies'
    allowed_domains = ['127.0.0.1']
    offset = 0
    url = "http://127.0.0.1:8080/board/4?offset="
    #********** Begin **********#
    # 1. Build the start URL; `offset` drives pagination.
    start_urls = [url + str(offset)]

    # 2. Parse one board page.
    def parse(self, response):
        movies = response.xpath("//div[ @class ='board-item-content']")
        for each in movies:
            # Create a fresh item for every movie. The original reused a
            # single Item instance across yields, so later iterations could
            # mutate items still queued in the pipeline.
            item = MaoyanItem()
            # movie title
            name = each.xpath(".//div/p/a/text()").extract()[0]
            # starring actors
            starts = each.xpath(".//div[1]/p/text()").extract()[0]
            # release date
            releasetime = each.xpath(".//div[1]/p[3]/text()").extract()[0]
            # the score is rendered as two <i> parts (integer + fraction)
            score1 = each.xpath(".//div[2]/p/i[1]/text()").extract()[0]
            score2 = each.xpath(".//div[2]/p/i[2]/text()").extract()[0]
            score = score1 + score2
            item['name'] = name
            item['starts'] = starts
            item['releasetime'] = releasetime
            item['score'] = score
            yield item
        # 3. Advance the offset by 10 and re-issue the request to page
        #    through the board (offsets 0..90).
        if self.offset < 90:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
    #********** End **********#
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


# Holds the overall information of one novel.
class NovelprojectItem(scrapy.Item):
    #********** Begin **********#
    name = scrapy.Field()         # novel title
    author = scrapy.Field()       # author name
    state = scrapy.Field()        # serialization state
    description = scrapy.Field()  # synopsis text
    #********** End **********#


# Holds a single chapter title, stored separately per novel.
class NovelprojectItem2(scrapy.Item):
    #********** Begin **********#
    tablename = scrapy.Field()  # needed when naming the per-novel table
    title = scrapy.Field()      # chapter title
    #********** End **********#
- `step2/NovelProject/NovelProject/pipelines.py`
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from NovelProject.items import NovelprojectItem, NovelprojectItem2


class NovelprojectPipeline(object):
    """Write novel metadata and chapter titles into the local MySQL database."""

    def process_item(self, item, spider):
        #********** Begin **********#
        # 1. Connect to the local database `mydb`.
        connection = pymysql.connect(
            host='localhost',   # local database server
            port=3306,          # database port
            user='root',        # MySQL user name
            passwd='123123',    # MySQL password
            db='mydb',          # database name
            charset='utf8',     # default character encoding
        )
        # One try/finally around both branches: the original only closed the
        # connection inside the two isinstance branches, leaking it (and
        # returning None) whenever an item of another type came through.
        try:
            # 2. Items carrying a novel's overall information.
            if isinstance(item, NovelprojectItem):
                with connection.cursor() as cursor:
                    sql1 = ('Create Table If Not Exists novel('
                            'name varchar(20) CHARACTER SET utf8 NOT NULL,'
                            'author varchar(10) CHARACTER SET utf8,'
                            'state varchar(20) CHARACTER SET utf8,'
                            'description text CHARACTER SET utf8,'
                            'PRIMARY KEY (name))')
                    # Parameterized insert: driver-side escaping prevents SQL
                    # injection / quote breakage (original used %-formatting).
                    sql2 = 'Insert into novel values (%s, %s, %s, %s)'
                    cursor.execute(sql1)
                    cursor.execute(
                        sql2,
                        (item['name'], item['author'], item['state'],
                         item['description']))
                # Commit this insert.
                connection.commit()
            # 3. Items carrying one chapter title.
            elif isinstance(item, NovelprojectItem2):
                tablename = item['tablename']
                with connection.cursor() as cursor:
                    # NOTE(review): table names cannot be sent as query
                    # parameters, so `tablename` is interpolated directly into
                    # the statement. It must come from trusted scraped data —
                    # confirm upstream sanitization.
                    sql3 = ('Create Table If Not Exists %s('
                            'title varchar(20) CHARACTER SET utf8 NOT NULL,'
                            'PRIMARY KEY (title))' % tablename)
                    # %%s survives the identifier interpolation as the %s
                    # placeholder for the parameterized value.
                    sql4 = 'Insert into %s values (%%s)' % tablename
                    cursor.execute(sql3)
                    cursor.execute(sql4, (item['title'],))
                connection.commit()
        finally:
            # Always release the connection, for every item type.
            connection.close()
        return item
        #********** End **********#