Proxies and CSS parsing libraries
-
Getting proxy IPs
```python
import requests


# Fetch a batch of proxy IPs from the API
def get_proxy_ips():
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=3'
    res = requests.get(api)
    # print(res.text)
    if res.status_code == 200:
        # The API returns a JSON error object when IPs are extracted too frequently
        if res.text[0] == '{':
            print('Failed to get proxies: extracting too frequently')
        else:
            # One 'ip:port' per line; drop the trailing empty element
            return res.text.split('\n')[:-1]
    else:
        print('Proxy request failed')


# Fetch the page data through a proxy
def get_net_data():
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
    }
    # Proxies
    ips = get_proxy_ips()
    if ips:
        proxies = {
            'http': ips[0],    # 'http': 'ip:port'
            'https': ips[1]
        }
        res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
        if res.status_code == 200:
            print(res.text)
        else:
            print('Request failed')
    else:
        print('Did not get any proxies')


if __name__ == '__main__':
    # Calling get_proxy_ips() separately here would waste one extraction and
    # can trigger the "too frequent" error, so only get_net_data() is called.
    get_net_data()
```
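A note on the `proxies` dict: requests expects each value to be a full proxy URL, so when the API returns bare `ip:port` strings it can help to prepend the scheme. A minimal sketch; the helper name `format_proxies` is mine, not from the code above:

```python
def format_proxies(ip_port: str) -> dict:
    """Build a requests-style proxies dict from a bare 'ip:port' string."""
    # requests routes both http and https traffic through the given proxy URL
    return {
        'http': f'http://{ip_port}',
        'https': f'http://{ip_port}',
    }


# Usage (example address, not a live proxy):
# proxies = format_proxies('117.26.88.160:4573')
```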
-
Optimizing the program that uses proxies
```python
import requests
import time


def get_proxy_ips():
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=3'
    res = requests.get(api)
    if res.status_code == 200:
        if res.text[0] == '{':
            print('Failed to get proxies: extracting too frequently')
        else:
            return res.text.split('\n')[:-1]
    else:
        print('Proxy request failed')


def get_net_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
    }
    while True:
        ips = get_proxy_ips()
        if not ips:
            print('Failed to get IPs')
            time.sleep(1)
            continue
        # Try each of the five proxies in turn until one succeeds;
        # if all of them fail, the while loop fetches a fresh batch
        for ip in ips:
            proxies = {
                'http': ip,    # 'http': 'ip:port'
                'https': ip
            }
            try:
                res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
                if res.status_code == 200:
                    print(res.text)
                    return res.text
                else:
                    print('Request through this IP failed!')
            except requests.exceptions.ConnectTimeout:
                print('Timed out, trying the next proxy')


if __name__ == '__main__':
    get_net_data('https://lib.swpu.edu.cn/')
```
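In practice a dead proxy can fail in more ways than a connect timeout: requests also raises `ProxyError` and `ReadTimeout`. A hedged variant of the try block that treats all of these as "move on to the next proxy" (this broadens the original, which only catches `ConnectTimeout`):

```python
import requests


def fetch_via_proxy(url, headers, proxies):
    """Return the page text, or None if this proxy should be skipped."""
    try:
        res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
        if res.status_code == 200:
            return res.text
        print('Request through this proxy failed!')
    except (requests.exceptions.ConnectTimeout,
            requests.exceptions.ReadTimeout,
            requests.exceptions.ProxyError) as err:
        # Any of these usually means the proxy is dead or too slow
        print(f'Proxy error, trying the next one: {err}')
    return None
```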
-
Using bs4
```python
import requests
from bs4 import BeautifulSoup


def get_net_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        print('Request failed')


def analysis_data(data: str):
    # 1. Create the parser object
    # BeautifulSoup(html string to parse, 'lxml')
    bs = BeautifulSoup(data, 'lxml')
    # print(bs)

    # 2. Get tags with a CSS selector
    # select(css selector)     -- returns all tags matched by the selector
    # select_one(css selector) -- returns the first tag matched by the selector
    result = bs.select('div>p')
    res = bs.select_one('div>p')
    print(result)
    print(res)

    # 3. Get tag content
    # tag.string     -- the tag's text (None if the tag contains several child tags,
    #                   or a mix of text and child tags); returns a string
    # tag.get_text() -- the tag's text, including the text of any child tags; returns a string
    # tag.contents   -- the tag's text and child tags; returns a list
    p1 = bs.select_one('div>p')
    print('p1-string:', p1.string)      # p1-string: 11111
    print('p1-text:', p1.get_text())    # p1-text: 11111
    print('contents:', p1.contents)     # contents: ['11111']

    p2 = bs.select_one('#p1')
    print(p2)                           # <p class="title" id="p1" name="dromouse"><b>The Dormouse's story</b></p>
    print('p2-string:', p2.string)      # p2-string: The Dormouse's story
    print('p2-text:', p2.get_text())    # p2-text: The Dormouse's story
    print('p2-contents:', p2.contents)  # p2-contents: [<b>The Dormouse's story</b>]

    p3 = bs.select_one('#p2')
    print('p3:', p3.string)             # p3: None
    print('p3-text:', p3.get_text())    # p3-text: 文字1文字2
    print('p3-contents:', p3.contents)  # p3-contents: ['文字1', <b>文字2</b>]

    # 4. Get tag attributes
    # tag.attrs            -- a dict of attribute names and values
    # tag.attrs[attr name] -- the value of one attribute
    img = bs.select_one('img')
    print(img.attrs)            # {'src': 'E:\\liudehua.jpg'}
    print(img.attrs['src'])     # E:\liudehua.jpg
    a = bs.select('a')
    print(a[-1].attrs['href'])  # www.baidu.com

    # 5. Search for child tags inside a given tag
    # tag.select(css selector)     -- all tags matched by the selector inside this tag
    # tag.select_one(css selector) -- the first tag matched by the selector inside this tag
    # print('All p tags:', bs.select('p'))
    div = bs.select_one('div')
    result = div.select('p')
    print('p tags inside div:', result)
    # p tags inside div: [<p>11111</p>, <p>22222</p>, <p id="p2">文字1<b>文字2</b></p>]


if __name__ == '__main__':
    # get_net_data('https://lib.swpu.edu.cn/')
    data = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p id="p1" class="title" name="dromouse"><b>The Dormouse's story</b></p>
            <p class="story">Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
            and they lived at the bottom of a well.</p>
            <p class="story">...</p>
            <div>
                <p>11111</p>
                <p>22222</p>
                <a href="www.baidu.com">百度一下</a>
                <img src="E:\liudehua.jpg">
                <p id="p2">文字1<b>文字2</b></p>
            </div>
        </body>
    </html>
    """
    analysis_data(data)
```
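Beyond the id and child selectors used above, `select()` accepts most common CSS selectors. A small self-contained sketch with class, attribute, and combined selectors:

```python
from bs4 import BeautifulSoup

html = """
<div>
  <p class="story">Once upon a time ...</p>
  <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
  <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</div>
"""
bs = BeautifulSoup(html, 'lxml')

# Class selector: every tag with class="sister"
print(bs.select('.sister'))

# Attribute selector: <a> tags whose href starts with http://example.com
print(bs.select('a[href^="http://example.com"]'))

# Combined selector: an id inside a tag context
print(bs.select_one('div a#link2').get_text())   # Lacie
```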
-
CSV file operations
```python
import csv

# ======================= Writing data to a csv file =======================
# 1. Create a writer
# csv.writer(file object) -- writes one row at a time from a list
# csv.DictWriter          -- writes one row at a time from a dict
with open('files/test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # 2. Write the data
    writer.writerow(['姓名', '性别', '年龄', '分数'])
    writer.writerows([
        ['张三', '男', 26, 89],
        ['小红', '女', 22, 78],
        ['李四', '男', 21, 90]
    ])

# 2. Providing the data as dicts
with open('files/test2.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, ['name', 'age', 'gender', 'score'])
    # Header row
    # Option 1: write it by hand
    writer.writerow({'name': '姓名', 'age': '年龄', 'gender': '性别', 'score': '分数'})  # '姓名','年龄','性别','分数'
    # Option 2: write the field names
    writer.writeheader()  # 'name', 'age', 'gender', 'score'
    # Write one row
    writer.writerow({'name': 'andy', 'age': 18, 'gender': '男', 'score': 96})
    writer.writerows([
        {'name': 'july', 'age': 18, 'gender': '男', 'score': 96},
        {'name': 'peter', 'age': 22, 'gender': '男', 'score': 88},
        {'name': 'jack', 'age': 19, 'gender': '男', 'score': 90}
    ])

# ======================= Reading a csv file =======================
# Note: any csv file can be read either as lists or as dicts
# 1. One list per row
with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
    # reader is an iterator whose elements are lists, one per row
    reader = csv.reader(f)
    next(reader)  # skip the header row
    print(list(reader))
    # [['张三', '男', '26', '89'], ['小红', '女', '22', '78'], ['李四', '男', '21', '90']]

# 2. One dict per row
with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    print(next(reader))  # {'姓名': '张三', '性别': '男', '年龄': '26', '分数': '89'}
```
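To tie the three topics together, here is a minimal sketch that parses movie titles from the Douban Top 250 page with bs4 and writes them to a csv file. The selector `div.hd` / `span.title` is an assumption about Douban's current markup and may need adjusting; proxies are omitted for brevity.

```python
import csv

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
}
res = requests.get('https://movie.douban.com/top250', headers=headers)
bs = BeautifulSoup(res.text, 'lxml')

# NOTE: these selectors are an assumption about Douban's markup, not verified here
rows = []
for item in bs.select('div.hd'):
    title = item.select_one('span.title').get_text()
    link = item.select_one('a').attrs['href']
    rows.append([title, link])

with open('files/top250.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['标题', '链接'])
    writer.writerows(rows)
```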