def main():
    """Run the pipeline: read the HTML file, extract image URLs, print and save them.

    The input and output paths are hard-coded to ``new.html`` and
    ``new-url.txt`` in the current working directory.
    """
    inputfile = "new.html"
    outputfile = "new-url.txt"
    htmlLines = getHtmlLines(inputfile)
    imageUrls = extractImageUrls(htmlLines)
    showResults(imageUrls)
    saveResults(outputfile, imageUrls)


def getHtmlLines(htmlpath):
    """Return the lines of the UTF-8 text file at *htmlpath*.

    Each returned line keeps its trailing newline, exactly as
    ``file.readlines()`` produces it.
    """
    # The context manager guarantees the handle is closed; the previous
    # code referenced ``f.close`` without calling it.
    with open(htmlpath, "r", encoding='utf-8') as f:
        return f.readlines()


def extractImageUrls(htmllist):
    """Collect the quoted ``src=`` values containing ``http`` from HTML lines.

    For every line, only the text after the *last* ``src=`` is inspected and
    the first double-quoted value following it is taken as the candidate URL.
    Lines without ``src=`` or without a quoted value are skipped instead of
    raising ``IndexError``.

    Returns a list of URL strings in input order (duplicates are kept).
    """
    urls = []
    for line in htmllist:
        if 'src=' not in line:
            # Without this guard the first quoted string of any line
            # (e.g. an href) would be mistaken for an image source.
            continue
        # Split on a single double quote: element 1 is the attribute value.
        pieces = line.split('src=')[-1].split('"')
        if len(pieces) < 2:
            continue  # 'src=' present but no quoted value on this line
        url = pieces[1]
        if 'http' in url:
            urls.append(url)
    return urls


def showResults(urls):
    """Print every URL in *urls* with a zero-based running index."""
    for count, url in enumerate(urls):
        print('第{:2}个url:{}'.format(count, url))


def saveResults(filepath, urls):
    """Write *urls* to *filepath*, one URL per line.

    The file is written as UTF-8 to match the encoding used when reading
    the HTML input, independent of the platform default.
    """
    with open(filepath, "w", encoding='utf-8') as f:
        f.writelines(url + "\n" for url in urls)


if __name__ == "__main__":
    # Only run the pipeline when executed as a script, not on import.
    main()
web页面元素提取
最新推荐文章于 2024-12-23 17:29:54 发布