'''
# Alternative 1: join all text nodes under <dd id="contents"> via XPath
chapter_Content = "".join(text.xpath(".//dd[@id='contents']//*/text()"))

# Alternative 2: capture the inner HTML with a lazy regex
pattern = re.compile('dd id="contents".*?>(.*?)</dd>')
match = pattern.search(ht)
chapter_Content = "".join(match.group(1).replace("&nbsp;", "").split()) if match else "爬取错误"
'''
# Chosen approach: strip every HTML tag inside <dd id="contents"> with re.subn
result, number = re.subn("<.*?>", "", str(soup.find("dd", id="contents")))
chapter_Content = "".join(result.split())
print(len(chapter_Content))
novel_ID = response.url.split("/")[-2]  # the novel id is the second-to-last URL segment
return ChapterItem(
    chapter_Url=response.url,
    _id=int(response.url.split("/")[-1].split(".")[0]),  # chapter id from the page file name
    novel_Name=novel_Name,
    chapter_Name=chapter_Name,
    chapter_Content=chapter_Content,
    novel_ID=novel_ID,
    is_Error=len(chapter_Content) < 3000,  # flag suspiciously short chapters as crawl errors
)
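The ChapterItem class itself is not shown in this excerpt. For reference, a minimal items.py sketch matching the fields used above might look like the following; the field comments are assumptions inferred from the parse code, not the original definitions:

# items.py -- hypothetical reconstruction based on the fields used above
import scrapy

class ChapterItem(scrapy.Item):
    _id = scrapy.Field()              # chapter number parsed from the URL
    chapter_Url = scrapy.Field()
    novel_Name = scrapy.Field()
    chapter_Name = scrapy.Field()
    chapter_Content = scrapy.Field()
    novel_ID = scrapy.Field()
    is_Error = scrapy.Field()         # True when the chapter text is shorter than 3000 chars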
3. Several ways to implement incremental crawling in Scrapy
1. Caching
With the HTTP cache enabled, every response is stored locally; on the next crawl, Scrapy serves previously crawled pages from the local cache first. In this mode, re-requesting an already crawled page needs no network round trip, so the crawl is not limited by bandwidth and puts no extra load on the server. The trade-offs are that changes to page content are never picked up, this approach is slower than the second one below, and the cache files take up a fair amount of disk space. The following commented-out lines in settings.py control the cache:
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
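For illustration, a sketch of how these lines might be uncommented to turn the cache on. The ignore list shown here is an assumption (skipping transient server errors so they are retried), not part of the original configuration:

# settings.py -- cache enabled; the values here are a sketch, not the original project's
HTTPCACHE_ENABLED = True                        # switch HttpCacheMiddleware on
HTTPCACHE_EXPIRATION_SECS = 0                   # 0 means cached responses never expire
HTTPCACHE_DIR = 'httpcache'                     # stored under the project's .scrapy directory
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 503]   # assumption: don't cache server errors
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'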