批量处理文件时,同时有压缩文件和未压缩文件
import pandas as pd
import os
import zipfile
# Iterate the data directory and load each CSV — plain or zipped — into a
# DataFrame. Everything is read as str so no column gets auto-typed.
for path in os.listdir(DATA_PATH):
    if path.endswith(".csv"):
        df = pd.read_csv(os.path.join(DATA_PATH, path), dtype=str)
    elif path.endswith(".zip"):
        # Context managers release the archive and member handles even on
        # error — the original leaked the ZipFile object.
        with zipfile.ZipFile(os.path.join(DATA_PATH, path), "r") as archive:
            # The zipped CSV is assumed to share the archive's base name
            # ("foo.zip" contains "foo.csv") — TODO confirm for all inputs.
            with archive.open(path[:-3] + "csv") as member:
                df = pd.read_csv(member, dtype=str)
    else:
        continue
去掉字符串中的标点符号和空格
import string

# str.maketrans("", "", chars) builds a deletion table directly — simpler
# than the dict comprehension mapping every character to None.
del_estr = str.maketrans("", "", string.punctuation + " ")
# Remove all ASCII punctuation and spaces from `name` (defined upstream).
name = name.translate(del_estr)
pandas删除不符合条件的行
# Keep only the rows whose "name" column is NOT in the blacklist.
names = ["abc", "def"]
df = df.loc[~df["name"].isin(names)]
插入数据库时引号的转义问题
方法一,转义
def transfer_content(content):
    """Escape quotes and backslashes so *content* can be embedded in a
    single-quoted SQL string literal.

    Returns None unchanged (so NULL columns stay NULL); otherwise returns
    the input with each of ", ' and \\ prefixed by a backslash.
    NOTE(review): parameterized queries (cursor.execute(sql, params)) make
    this escaping unnecessary — prefer them where possible.
    """
    if content is None:
        return None
    # join() builds the result in one pass; the original's repeated `+=`
    # was quadratic and its local `string` shadowed the stdlib module.
    return "".join("\\" + c if c in ('"', "'", "\\") else c for c in content)
# Parameterized insert: the driver handles all quoting/escaping, which both
# removes the need for transfer_content() and prevents SQL injection from
# the string-interpolated values.
sql = (
    "INSERT INTO `test`.`hipac_product` "
    "(`name`, `code`, `params`, `detail_url`, `type`, `title`, "
    "`description`, `pid`) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
)
cursor.execute(
    sql,
    (
        name,
        code,
        json.dumps(params, ensure_ascii=False),
        target_url,
        _type,
        title,
        description,
        pid,
    ),
)
方法二,使用cur.execute(sql, value)
def insert_image(cls, _type, url, pid):
    """Insert one image row via a named-parameter query (method two:
    let the driver do the quoting instead of escaping by hand)."""
    sql = "INSERT INTO `scrapy_product_img` (`type`, `url`, `pid`) VALUES (%(type)s, %(url)s, %(pid)s)"
    cur.execute(sql, {"type": _type, "url": url, "pid": pid})
    cnx.commit()
抓取网络上的csv文件并保存到数据库
# Download the OFAC consolidated sanctions CSV (no header row) and append it
# to MySQL. Filler columns get single-letter placeholder names so they can
# be dropped by name below.
engine = create_engine("mysql+mysqldb://{}:{}@{}/{}".format(USERNAME, PASSWORD, HOST, DB_NAME))
# NOTE(review): `con` is never closed — consider `with engine.connect() as con:`
con = engine.connect()
cols = ["id", "name", "type", "d", "designation", "f", "g", "h", "i", "nationality", "k", "comments1"]
df = pd.read_csv("https://www.treasury.gov/ofac/downloads/consolidated/cons_prim.csv", names=cols, index_col=False)
# Drop the placeholder columns; the slice discards the last row —
# presumably the file's trailing record-count line (TODO confirm).
df = df.drop(["d", "f", "g", "h", "i", "k"], axis=1)[:(len(df.index) - 1)]
df.to_sql(name="cons_prim", con=con, if_exists="append", index=False)
清理会导致csv文件格式错误的字符
# Characters that would corrupt CSV output (CR, LF, comma) each map to a
# single space; str.translate applies the whole table in one C-level pass.
_csv_safe_table = str.maketrans({"\r": " ", "\n": " ", ",": " "})
dtypes = df.dtypes
for col in df.columns:
    if dtypes[col] == object:
        # str(cell) first, mirroring the original (NaN becomes "nan").
        df[col] = df[col].apply(lambda cell: str(cell).translate(_csv_safe_table).strip())
获取图片并保存到内存中
from PIL import Image
def my_func(self, file_url, ext):
    """Download the image at *file_url* and return its bytes re-encoded
    in the format implied by *ext* (extension without the leading dot).
    """
    # Local imports keep this snippet self-contained.
    import io
    import urllib.request

    # `urllib.urlopen` is Python 2 only — Python 3 needs urllib.request.
    # Read the body fully into a BytesIO so PIL gets a seekable stream
    # (HTTP responses are not seekable).
    with urllib.request.urlopen(file_url) as resp:
        src = io.BytesIO(resp.read())
    im = Image.open(src)
    out = io.BytesIO()
    # Map ".jpg" -> "JPEG" etc.; renamed from `format`, which shadowed
    # the builtin.
    out_format = Image.registered_extensions()['.' + ext]
    im.save(out, out_format)
    return out.getvalue()
上传图片字节
def create(im):
    """Upload raw image bytes to the create endpoint.

    im: the image content as bytes.
    Returns the endpoint's JSON response, parsed.
    """
    upload = {'image': ('a.jpg', im)}
    response = requests.post('http://xxx.com/api/create.json', files=upload)
    return response.json()
用pandas快速爬取网页表格
import pandas as pd
import requests
from sqlalchemy import create_engine
conn = create_engine('mysql+mysqldb://{}:{}@{}:{}/{}?charset=utf8'.format(USERNAME, PASSWORD, HOST, PORT, DB_NAME))
html_province = requests.get("http://www.citypopulation.de/USA-Cities.html").text
# Parse the page once and reuse the table list — the original ran
# pd.read_html over the same HTML twice.
tables = pd.read_html(html_province)
tables[0].to_sql("usa_province", conn, index=False, if_exists="append")
tables[2].to_sql("usa_city", conn, index=False, if_exists="append")
通过sftp(paramiko)上传文件
import paramiko
# Upload a file over SFTP (SSH port 22) — paramiko, not plain FTP.
handle = paramiko.Transport(("ftp.xxx.com", 22))
try:
    handle.connect(username=USERNAME, password=PASSWORD)
    sftp = paramiko.SFTPClient.from_transport(handle)
    # from_transport may return None if no SFTP subsystem is negotiated.
    if sftp:
        logger.info("login success, now will execute some command")
        sftp.put(file_name, "Inbox/" + file_name)
finally:
    # Release the transport even when connect/put raises — the original
    # leaked it on any error.
    handle.close()
用requests获取的HTML有某些字符解码错误
def get_html(url):
    """Fetch *url* and decode the raw body as GB2312, silently dropping
    any bytes that cannot be decoded (avoids UnicodeDecodeError on pages
    with stray characters)."""
    raw = requests.get(url, headers=headers).content
    return raw.decode("gb2312", "ignore")
pandas导入数据库时设置字符编码集
from sqlalchemy import create_engine
# Pass the client charset via connect_args instead of a "?charset=" URL suffix.
db_url = "mysql+mysqldb://{}:{}@{}:{}/{}".format(USERNAME, PASSWORD, HOST, PORT, DB_NAME)
engine = create_engine(db_url, connect_args={'charset': 'utf8'})
pandas导入大量数据内存不够的解决方法
import pandas as pd
# Stream the TSV in chunks so the whole file never has to fit in memory —
# chunksize on to_sql alone only batches the WRITES; the original still
# loaded the entire file with read_csv first.
for chunk in pd.read_csv(businesses_tsv, sep="\t", chunksize=10000):
    chunk.to_sql(name="businesses_new", con=conn, if_exists="append", index=False)
Firefox的headless模式
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
# Launch Firefox with no visible window (headless mode).
firefox_options = Options()
firefox_options.headless = True
driver = webdriver.Firefox(
    executable_path=FIREFOX_EXECUTABLE_PATH,
    options=firefox_options,
)
Chrome的headless模式
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
# Headless Chrome setup; the flags below are the usual CI/container set.
# Use the Options class imported above — the original imported it but then
# instantiated webdriver.ChromeOptions() instead.
options = Options()
options.add_argument("start-maximized")
options.add_argument("enable-automation")
options.add_argument("--no-sandbox")  # needed when running as root in containers
options.add_argument("--disable-infobars")
options.add_argument("--disable-dev-shm-usage")  # work around small /dev/shm in Docker
options.add_argument("--disable-browser-side-navigation")
options.add_argument("--disable-gpu")
options.add_argument("--headless")
# `chrome_options=` is deprecated; Selenium >= 3.8 expects `options=`.
driver = webdriver.Chrome(options=options)
selenium多条件等待
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class QueryFinished:
    """Composite expected-condition for WebDriverWait: the query is
    finished when either the "no-data" placeholder is visible or at
    least one result row is present in the results table."""

    def __call__(self, driver):
        """Return truthy when either sub-condition holds.

        :param driver: the WebDriver instance supplied by WebDriverWait.
        """
        if EC.visibility_of_element_located((By.CLASS_NAME, "no-data"))(driver):
            return True
        # BUG FIX: the original built the presence condition but never
        # called it with `driver`, so bool(condition_object) was always
        # True and the wait returned immediately. Invoke it to actually
        # probe the DOM. If the rows are absent, NoSuchElementException
        # propagates and WebDriverWait ignores it by default, polling on.
        row_locator = (By.XPATH, '//*[@id="societyCodeCheck"]/table/tbody/tr')
        return bool(EC.presence_of_element_located(row_locator)(driver))
# Poll every 0.5 s, for up to 30 s, until QueryFinished() returns truthy.
WebDriverWait(driver, 30, 0.5).until(QueryFinished())