#coding=utf-8
# Crawler code, kept as a memo.
#__author__ = chengzhipeng
import os
import re
import ssl
import sys
from urllib import request

from bs4 import BeautifulSoup

# Index page of the novel to crawl. Previously used alternatives:
# url = 'http://www.biqiuge.com/book/4772/'
# url = 'https://www.qu.la/book/1/'
url = 'http://www.biquge.com.tw/14_14055/'


def getHtmlCode(url):
    """Fetch *url* and return the response body parsed by BeautifulSoup.

    The body is parsed with the built-in ``'html.parser'`` backend. Call
    ``.prettify()`` on the result if an indented string is wanted instead
    of the tree.

    Any exception raised by ``urllib.request.urlopen`` propagates to the
    caller unchanged.
    """
    # Use the response as a context manager so the connection is closed
    # even if read() raises.
    with request.urlopen(url) as page:
        html = page.read()
    htmlTree = BeautifulSoup(html, 'html.parser')
    return htmlTree


def getKeyContent(url):
    """Fetch and parse *url*; the parsed tree is currently discarded.

    NOTE(review): this looks like an unfinished stub -- it returns None.
    """
    htmlTree = getHtmlCode(url)


def parserCaption(url):
    """Print the novel title and the chapter links found on the page at *url*.

    The title is the text of the page's first ``<h1>`` with ``'.txt'``
    appended. Chapter links are the ``<a>`` tags whose ``href`` matches the
    pattern ``(\\d)*.html``.
    """
    htmlTree = getHtmlCode(url)
    storyName = htmlTree.h1.get_text() + '.txt'
    print('小说名:',storyName)
    # Raw string avoids the invalid-escape warning for ``\d``; the pattern
    # itself is unchanged.
    aList = htmlTree.find_all('a', href=re.compile(r'(\d)*.html'))
    # aList holds bs4 Tag objects; they must be converted to str before
    # being written to a file.
    # Debug aid: print(int(aList[1]['href'][0:-5]))
    print(aList)
    aDealList = []
    # NOTE(review): the available source is cut off here, in the middle of
    # the statement "for line in a". The remainder of this function was not
    # visible and has deliberately not been reconstructed.
爬虫代码
最新推荐文章于 2025-03-10 12:19:36 发布