爬虫

# -*- coding: utf-8 -*-
# @Time : 2019/5/31 19:33
# @Author : zejin
# @File : pachong.py

from urllib import request
import re

class Analysis:
    """Fetch a web page and pull name/link pairs out of its cover blocks.

    The page at ``url`` is downloaded, every ``<div class="cover">`` section
    is located with ``root_patten``, and within each section the ``alt``
    text and the ``href`` target are extracted with ``name_patten`` and
    ``adress_patten`` respectively.
    """

    # Page that is downloaded by go().
    url = 'https://book.douban.com/'
    # Raw strings keep ``\s`` / ``\S`` as regex escapes instead of relying on
    # Python passing an invalid string escape through (which warns).
    # Captures the inner HTML of each <div class="cover"> ... </div>.
    root_patten = r'<div class="cover">([\s\S]*?)</div>'
    # Captures the value of an alt="..." attribute that closes its tag.
    name_patten = r'alt="([\s\S]*?)">'
    # Captures the value of an href="..." attribute followed by `` title``.
    adress_patten = r'href="([\s\S]*?)" title'

    def __face_connect(self):
        """Download ``self.url`` and return the body decoded as UTF-8 text.

        Raises:
            urllib.error.URLError: if the request fails.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(self.url) as response:
            raw = response.read()
        return raw.decode('utf-8')

    def __analysis(self, htmls):
        """Extract one record per cover block found in ``htmls``.

        Args:
            htmls: HTML text to scan.

        Returns:
            A list of dicts, one per ``root_patten`` match, each of the form
            ``{"name": [...], "adress": [...]}`` where both values are the
            lists returned by ``re.findall`` (possibly empty).
        """
        return [
            {
                "name": re.findall(self.name_patten, fragment),
                "adress": re.findall(self.adress_patten, fragment),
            }
            for fragment in re.findall(self.root_patten, htmls)
        ]

    def __refine(self, ancors):
        """Placeholder for post-processing of the extracted records.

        Currently does nothing and returns ``None``; go() does not call it.
        """

    def go(self):
        """Download the page, extract the records and print them."""
        htmls = self.__face_connect()
        ancors = self.__analysis(htmls)
        print(ancors)

# Script entry: create the scraper and run one download/extract/print pass.
# NOTE(review): there is no ``if __name__ == "__main__"`` guard, so this also
# runs (and performs the network request) whenever the module is imported.
analysis = Analysis()
analysis.go()

转载于:https://www.cnblogs.com/jinbaobao/p/10959606.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值