- 博客(18)
- 收藏
- 关注
原创 pyppeteer实战
import loggingimport asynciofrom pyppeteer import launchfrom pyppeteer.errors import TimeoutErrorfrom motor.motor_asyncio import AsyncIOMotorClient#数据库操作motor_connect_string = 'mongodb://localhost:27017'momgodb_name = 'movie'mongo_collection_name.
2022-03-26 20:50:48
255
原创 huya弹幕提取小尝试
"""爬取实时的虎牙弹幕""""""不出现重复弹幕""""""不遗漏弹幕"""from selenium import webdriverimport timeweb = webdriver.Chrome()web.get('https://www.huya.com/52333')"""第一版"""# while True:# bullets_chat = web.find_elements_by_xpath('//*[@id="chat-room__list"]/div/d.
2022-03-26 14:53:31
889
原创 motor 异步存储 实战
""" 1.爬取所有页面书的信息 2.取出id,构造url,爬取详情页信息 3.motor异步存储"""import aiohttpimport loggingimport asynciofrom motor.motor_asyncio import AsyncIOMotorClientconcurrency = 10session = Nonelogging.basicConfig(level=logging.INFO,format='%(levelname)s-.
2022-03-25 22:59:14
416
原创 pymongo 基础操作
import pymongoclient = pymongo.MongoClient('mongodb://localhost:27017/')#连接数据库db = client.test #等价于client['test'],创建test数据库#指定集合collection = db.students #等价于db['students']#插入数据student = { "id":"20170101", "name":"Jorden", "age":20, ..
2022-03-25 22:40:08
307
原创 Selenium动作链 + 运行Javascript
from selenium import webdriverfrom selenium.webdriver import ActionChainsbrowser = webdriver.Chrome()url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'browser.get(url)browser.switch_to.frame('iframeResult')source = browser.fi.
2022-03-24 16:59:46
141
原创 异步协程超时设置 + 并发限制
import aiohttpimport asyncioasync def main(): url = 'https://www.httpbin.org/get' timeout = aiohttp.ClientTimeout(total=1.5) async with aiohttp.ClientSession(timeout=timeout) as session: try: async with session.get(url) .
2022-03-23 21:37:15
1882
原创 数据保存三种格式 TXT,json,csv
#txt文本保存data = 'data'with open('filename.txt',mode='w',encoding='utf-8') as f: f.write(data)#json文件保存import jsonstr = '''[{"2":"5"},{"1":"4"}]'''data =json.load(open('filename.json',encoding='utf-8')) #载入json文件json.dump(open(str,'filename.j.
2022-03-19 19:56:45
1105
1
原创 httpx库Client对象的使用+http2.0的使用
# httpx的cilent对象使用# 方法1import httpxheaders = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Safari/537.36"}with httpx.Client(headers=headers) as client: resp = client.get("https://www.
2022-03-19 01:06:34
1145
原创 selenium 配置无头浏览器
from selenium.webdriver import Chromefrom selenium.webdriver.support.select import Select#无头from selenium.webdriver.chrome.options import Options# 准备好参数配置opt = Options()opt.add_argument("--headless")opt.add_argument("--disable-gpu")web = Chrome(.
2022-03-05 22:05:22
816
原创 异步爬虫模板 加上过时警告
import requestsimport asyncioasync def download(url): print("开始下载") # 这里可以改成await asyncio.requests.get() await asyncio.sleep(3) print("下载完成")async def main(): urls =[ "https://www.baidu.com", "https://www.bilibil...
2022-01-27 02:03:39
1077
原创 异步协程小练习
# 简单的多任务异步协程import asyncioimport timefrom pip import mainasync def func1(): print("你好啊,我叫李雪琴") await asyncio.sleep(3) print("你好啊,我叫李雪琴")async def func2(): print("你好啊,我叫潘金莲") await asyncio.sleep(3) print("你好啊,我叫潘金莲")asyn.
2022-01-27 01:50:29
273
原创 抓网抑云热评小练习
import requests# 网抑云评论XHR链接url = "https://music.163.com/weapi/comment/resource/comments/get?csrf_token="data = { "params": "jSBWuuxiWpaY1ca404/E2u+GWnIEFculrIxEEW3p/hIXCEcITyikYob0zr8EzKcyP0UM7+UILfSJdTnVKYNAXiOFc/2T/Tk0rfe7qImewe6x82DlF2YNK+T2j5.
2022-01-27 00:24:46
1038
原创 用线程池提高操作效率
import csvfrom concurrent.futures import ThreadPoolExecutorimport requestsf= open("新发地.csv",mode="w",encoding="utf-8",newline="") csvwriter = csv.writer(f)def down_load_allpages(): passdef down_load_onepage(datas): url = "http://www.xinfa.
2022-01-26 23:33:17
270
原创 创建线程的两种方法
第一种:# 启动每一个程序默认都有一个主线程from threading import Threaddef func(): for i in range(1000): print("func",i)if __name__ == "__main__": t = Thread(target=func) t.start() for i in range(1000): print("主线程",i)第二种:run()里记得加
2022-01-25 18:41:59
215
原创 Xpath练习 - 猪八戒网
import requestsfrom lxml import etreeimport csv# 请求数据url = "https://chongqing.zbj.com/search/f/?kw=python"headers = { "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/53.
2022-01-25 04:42:35
1144
1
原创 xpath练习-优美图
from lxml import etreeimport requestsurl = "https://www.youmeitu.com/meinv/"headers = { "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36" }resp = requests.get.
2022-01-24 14:20:20
207
原创 BeautifulSoup小练习-优美图
import requestsfrom bs4 import BeautifulSoupurl = "https://www.youmeitu.com/meinv/"img_urls = []headers = { "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36" ..
2022-01-24 13:21:41
332
原创 python re模块findall、finditer、search、match、compile用法
import re# findall匹配字符串中所有的符合正则的内容lst = re.findall("\d+","我的电话号码:10086,我女朋友的电话是:10010")print(lst) #打印10086/n10010# finditer 匹配字符串中所有的内容[返回的是迭代器]it = re.finditer("\d+","我的电话号码:10086,我女朋友的电话是:10010")print(it)for i in it: print(i.group())#打印10086/n.
2022-01-23 18:01:03
405
空空如也
空空如也
TA创建的收藏夹 TA关注的收藏夹
TA关注的人