python编程常用技巧

批量处理文件时,同时有压缩文件和未压缩文件

import pandas as pd
import os
import zipfile

for path in os.listdir(DATA_PATH):
	if path.endswith(".csv"):
		df = pd.read_csv(DATA_PATH + "/" + path, dtype=str)
	elif path.endswith(".zip"):
		archive = zipfile.ZipFile(DATA_PATH + "/" + path, 'r')
		xlfile = archive.open(path[:-3] + "csv")
		df = pd.read_csv(xlfile, dtype=str)
	else:
		continue

去掉字符串中的标点符号和空格

import string

del_estr = str.maketrans({key: None for key in (string.punctuation + " ")})
name = name.translate(del_estr)

pandas删除不符合条件的行

names = ["abc", "def"]
df = df[~df["name"].isin(names)]

插入数据库是引号的转义问题

方法一,转义

def transfer_content(content):
	if content is None:
		return None
	else:
		string = ""
		for c in content:
			if c == '"':
				string += '\\\"'
			elif c == "'":
				string += "\\\'"
			elif c == "\\":
				string += "\\\\"
			else:
				string += c
		return string

sql = "INSERT INTO `test`.`hipac_product` (`name`, `code`, `params`, `detail_url`, `type`, `title`, " \
          "`description`, `pid`) VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
          % (name, code, transfer_content(json.dumps(params, ensure_ascii=False)), target_url, _type, transfer_content(title), transfer_content(description), pid)
cursor.execute(sql)

方法二,使用cur.execute(sql, value)

def insert_image(cls, _type, url, pid):
	sql = "INSERT INTO `scrapy_product_img` (`type`, `url`, `pid`) VALUES (%(type)s, %(url)s, %(pid)s)"
	value = {
		'type': _type,
		'url': url,
		'pid': pid
	}
	cur.execute(sql, value)
	cnx.commit()

抓取网络上的csv文件并保存到数据库

engine = create_engine("mysql+mysqldb://{}:{}@{}/{}".format(USERNAME, PASSWORD, HOST, DB_NAME))
con = engine.connect()

cols = ["id", "name", "type", "d", "designation", "f", "g", "h", "i", "nationality", "k", "comments1"]
df = pd.read_csv("https://www.treasury.gov/ofac/downloads/consolidated/cons_prim.csv", names=cols, index_col=False)
df = df.drop(["d", "f", "g", "h", "i", "k"], axis=1)[:(len(df.index) - 1)]
df.to_sql(name="cons_prim", con=con, if_exists="append", index=False)

清理会导致csv文件格式错误的字符

dtypes = df.dtypes
for col in df.columns:
	if object == dtypes[col]:
		df[col] = df[col].apply(lambda x: str(x).replace("\r", " ").replace("\n", " ").replace(",", " ").strip())

获取图片并保存到内存中

from PIL import Image

def my_func(self, file_url, ext):
	"""获取图片并存储在内存中"""
	im = Image.open(urllib.urlopen(file_url))
	fp = io.BytesIO()
	format = Image.registered_extensions()['.' + ext]
	im.save(fp, format)
	return fp.getvalue()

上传图片字节

def create(im):
	"""
	上传图片
	im: 图片字节
	"""
	files = {'image': ('a.jpg', im)}
	r = requests.post('http://xxx.com/api/create.json', files=files)
	return r.json()

用pandas快速爬取网页表格

import pandas as pd
import requests
from sqlalchemy import create_engine

conn = create_engine('mysql+mysqldb://{}:{}@{}:{}/{}?charset=utf8'.format(USERNAME, PASSWORD, HOST, PORT, DB_NAME))
html_province = requests.get("http://www.citypopulation.de/USA-Cities.html").text
pd.read_html(html_province)[0].to_sql("usa_province", conn, index=False, if_exists="append")
pd.read_html(html_province)[2].to_sql("usa_city", conn, index=False, if_exists="append")

通过ftp上传文件

import paramiko

handle = paramiko.Transport(("ftp.xxx.com", 22))
handle.connect(username=USERNAME, password=PASSWORD)
sftp = paramiko.SFTPClient.from_transport(handle)
if sftp:
	logger.info("login success, now will execute some command")
	sftp.put(file_name, "Inbox/" + file_name)
	handle.close()

用requests获取的HTML有某系字符解码错误

def get_html(url):
    content = requests.get(url, headers=headers).content
    html = str(content, "gb2312", "ignore")
    return html

pandas导入数据库时设置字符编码集

from sqlalchemy import create_engine

engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USERNAME, PASSWORD, HOST, PORT, DB_NAME), connect_args={'charset':'utf8'})

pandas导入大量数据内存不够的解决方法

import pandas as pd

pd.read_csv(businesses_tsv, sep="\t").to_sql(name="businesses_new", con=conn, if_exists="append", index=False, chunksize=10000)

Firefox的headless模式

from selenium.webdriver.firefox.options import Options
from selenium import webdriver

firefox_options = Options()
firefox_options.headless = True
driver = webdriver.Firefox(executable_path=FIREFOX_EXECUTABLE_PATH, options=firefox_options)

Chrome的headless模式

from selenium.webdriver.chrome.options import Options
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument("enable-automation")
options.add_argument("--no-sandbox")
options.add_argument("--disable-infobars")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-browser-side-navigation")
options.add_argument("--disable-gpu")
options.add_argument("--headless")
driver = webdriver.Chrome(chrome_options=options)

selenium多条件等待

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class QueryFinished:
    def __call__(self,driver):
        """
        用来结合webDriverWait判断是否出现查询完成
        :param driver:
        :return:
        """
        is_no_data = bool(EC.visibility_of_element_located((By.CLASS_NAME, "no-data"))(driver))
        if is_no_data:
            return True
        else:
            is_finished = EC.presence_of_element_located((By.XPATH, '//*[@id="societyCodeCheck"]/table/tbody/tr'))
            is_visible = bool(is_finished)
            if is_visible:
                return True
            else:
                return False

WebDriverWait(driver, 30, 0.5).until(QueryFinished())

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值