由于代码较长,注释直接写在代码内部。
该程序可以抓取小说的名称、作者、封面以及所有章节的信息。
(0)先看效果
(1)数据库设计
from django.db import models
from django.contrib.auth.models import AbstractUser
from django.db.models.signals import pre_delete
from django.dispatch import receiver
# Novel list: one row per novel.
class NovelList(models.Model):
    """A novel and its summary metadata (title, cover, author, synopsis)."""

    nid = models.AutoField(primary_key=True)
    # Novel title.
    name = models.CharField(verbose_name='小说标题', max_length=15)
    # Cover file for the novel.
    # NOTE(review): the upload directory is literally 'novel./' — possibly a
    # typo for 'novel/'; left unchanged because stored paths depend on it.
    url = models.FileField(verbose_name='小说封面地址', upload_to='novel./', default='')
    # Author name.
    author = models.CharField(verbose_name='小说作者', max_length=20, default="")
    # Short introduction / synopsis.
    introduce = models.CharField(verbose_name='小说介绍', max_length=300, default="")
    # Number of chapters in the novel.
    pages = models.IntegerField(verbose_name='小说章节数', default=0)

    class Meta:
        verbose_name_plural = '小说'

    def __str__(self):
        """Return the novel title so instances are readable in the admin and shell."""
        return self.name
# Novel chapter: one row per chapter.
class NovelContent(models.Model):
    """A single chapter of a novel, with ids pointing at its neighbouring chapters."""

    nid = models.AutoField(primary_key=True)
    # Chapter text (limited to 8000 characters by the field definition).
    content = models.CharField(verbose_name='章节内容', max_length=8000)
    # Novel this chapter belongs to; chapters are removed together with the
    # novel because of on_delete=CASCADE.
    novel = models.ForeignKey(verbose_name='小说id', to='NovelList', to_field='nid', on_delete=models.CASCADE)
    # Chapter title.
    title = models.CharField(verbose_name='章节标题', max_length=25)
    # Id of the previous chapter.
    # NOTE(review): 0 presumably means "no previous chapter" — confirm against the scraper.
    pre_chapter_id = models.IntegerField(verbose_name='上一章章节id', default=0)
    # Id of the next chapter.
    # NOTE(review): 0 presumably means "no next chapter" — confirm against the scraper.
    then_chapter_id = models.IntegerField(verbose_name='下一章章节id', default=0)

    class Meta:
        verbose_name_plural = '小说内容'

    def __str__(self):
        """Return the chapter title so instances are readable in the admin and shell."""
        return self.title
(2)抓取小说主函数
import re
import requests
from lxml import etree
import os
import time
# Script bootstrap: select the Django settings module and initialise Django
# before the project's models are imported.
# NOTE(review): indentation was lost in this listing, so which of the
# statements below belong under the `if` cannot be determined from here.
if __name__ == '__main__':
# Only sets the variable when it is not already present in the environment.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'djangoProject.settings')
import django
import random
# Initialise Django; this has to precede the `from app import models` below.
django.setup()
from app import models
from django.core.files import File
from django.core.files.base import ContentFile
# NOTE(review): second django.setup() call looks redundant — confirm and remove.
django.setup()
# 获得html文章
def get_html(url):
user_agent = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET