文本处理学习笔记5

环境如下:
python-2.5.2

工作中经常遇到类似csv格式的文件文件,
为了处理起来方便,一般要导入到数据库中

MYSQL导入工具代码:

# coding:UTF-8

import os
import sys
import time
import MySQLdb

perread = 1048576 # 1M
original = 32
size_list = []
for i in range(125):
size_list.append(original)
original = original + 32
def getSize(aName, aSize):
for i in size_list:
if aSize<i:
return i
raise ValueError("字段太长 '%s' %s"%(aName, aSize))

def loadCSV(fileName, separtor=" "):
print "loadCSV", fileName, "..."
tf = file(fileName, "r")
connMys = MySQLdb.connect(host="127.0.0.1", user="root", passwd="pass", db="test", charset="UTF8")
begin = time.time()

mark = 0
count = 0
currLines = tf.readlines(perread)
count = count + len(currLines) - 1
currTable = [i.rstrip().split(separtor) for i in currLines if len(i.rstrip())>0]
columnName = currTable[0]
for i in columnName:
if not i:
raise ValueError("字段名不能为空!")
columnCount = len(columnName)
currTable = fillSpace(currTable, columnCount)
currTable = distinctColumn(currTable)
aName = os.path.split(fileName)[1].split(".")[0]
dropSQL, createSQL, insertSQL = buildCreate(aName, currTable)

cursorMys = connMys.cursor()
print dropSQL, ";"
cursorMys.execute(dropSQL)
print createSQL, ";"
cursorMys.execute(createSQL)
connMys.commit()
print insertSQL, ";"
for i in currTable[1:]:
cursorMys.execute(insertSQL, i)
connMys.commit()
print "insert", count, "lines."

while tf.tell() > mark:
mark = tf.tell()
currLines = tf.readlines(perread)
if not tf.tell() > mark:
break
count = count + len(currLines)
currTable = [i.rstrip().split(separtor) for i in currLines if len(i.rstrip())>0]
currTable = fillSpace(currTable, columnCount)
for i in currTable:
cursorMys.execute(insertSQL, i)
connMys.commit()
print "insert", count, "lines."

end = time.time()
print "Count:%s Time:%s"%(count, end - begin)
cursorMys.close()
connMys.close()
tf.close()

def fillSpace(aTable, colCount):
for ii, iv in enumerate(aTable):
while len(iv) < colCount:
iv.append("")
if len(iv) != colCount:
raise ValueError("字段不够多!" + str(iv))
return aTable

def distinctColumn(aTable):
columnNames = aTable[0]
ucolumnNames = set(columnNames)
if len(columnNames)==len(ucolumnNames):
return aTable
tcinclude = []
tcexclude = []
for i, v in enumerate(columnNames):
if v in tcinclude:
print "字段竟然重复", v
tcexclude.append(i)
else:
tcinclude.append(v)
result = []
for i in aTable:
result.append([v for j, v in enumerate(i) if j not in tcexclude])
return result

def buildCreate(aName, aTable):
columnNames = aTable[0]
columnSizes = [getSize(k, len(v)) for k, v in zip(aTable[0], aTable[1])]
for i in aTable[2:]:
for j, v in enumerate(i):
if not len(v)<columnSizes[j]:
columnSizes[j] = getSize(columnNames[j], len(v))

dropSQL = "DROP TABLE IF EXISTS " + aName
createSQL = ["CREATE TABLE IF NOT EXISTS " + aName]
createSQL.append("(")
for k, v in zip(columnNames, columnSizes):
currLines = " %s VARCHAR(%s),"%(k, v)
createSQL.append(currLines)
createSQL[-1] = " %s VARCHAR(%s)"%(k, v)
createSQL.append(")")
k = ", ".join(columnNames)
v = ", ".join(["%s" for i in range(len(columnNames))])
insertSQL = "INSERT INTO %s (%s) VALUES (%s)"%(aName, k, v)
return (dropSQL, "\n".join(createSQL), insertSQL)

def usage():
print "CSVloader -h"
print "CSVloader -f bsc.csv bts.csv"
print "CSVloader -d dir1 dir2"
print "CSVloader -f bsc.csv bts.csv -s ,"
print "CSVloader -d dir1 dir2 -s ,"
print "CSVloader -s , -f bsc.csv bts.csv"
print "CSVloader -s , -d dir1 dir2"
print " Options include:"
print " -h [help] - 打印帮助"
print " -s [separtor] - 使用的分割符,默认 tab"
print " -f [fileName] - 文件名,可以指定多个"
print " -d [dirName] - 文件夹,可以指定多个"
print
print " 注意:文件必须是 UTF-8 编码"

if __name__ == "__main__":
if "-h" in sys.argv:
usage()
elif "-d" in sys.argv:
separtor = " "
idxf = sys.argv.index("-d")
if "-s" in sys.argv:
idxs = sys.argv.index("-s")
separtor = sys.argv[idxs+1]
if idxs>idxf:
for i in sys.argv[idxf+1:idxs]:
for j in os.listdir(i):
loadCSV(os.path.join(i, j), separtor)
else:
for i in sys.argv[idxf+1:]:
for j in os.listdir(i):
loadCSV(os.path.join(i, j), separtor)
else:
for i in sys.argv[idxf+1:]:
for j in os.listdir(i):
loadCSV(os.path.join(i, j), separtor)
elif "-f" in sys.argv:
separtor = " "
idxf = sys.argv.index("-f")
if "-s" in sys.argv:
idxs = sys.argv.index("-s")
separtor = sys.argv[idxs+1]
if idxs>idxf:
for i in sys.argv[idxf+1:idxs]:
loadCSV(i, separtor)
else:
for i in sys.argv[idxf+1:]:
loadCSV(i, separtor)
else:
for i in sys.argv[idxf+1:]:
loadCSV(i, separtor)
else:
usage()




使用方法如下:
CSVloader -h
CSVloader -f bsc.csv bts.csv
CSVloader -d dir1 dir2
CSVloader -f bsc.csv bts.csv -s ,
CSVloader -d dir1 dir2 -s ,
CSVloader -s , -f bsc.csv bts.csv
CSVloader -s , -d dir1 dir2
Options include:
-h [help] - 打印帮助
-s [separtor] - 使用的分割符,默认 tab
-f [fileName] - 文件名,可以指定多个
-d [dirName] - 文件夹,可以指定多个

注意:文件必须是 UTF-8 编码

顺便写了个bat文件:
CSVloader.bat

@echo off
python CSVloader.py %*

(WinXP SP2,MySQL 5.0下使用通过,MySQL数据库使用UTF8编码,csv文件使用UTF8编码)
测试文本:A_OBJECT.txt

oid,city,type,pid
7,邢台市,103,7
8,廊坊市,103,8
1,沧州市,103,1
3,高邑县,104,4
4,河间市,104,1
2,赵县,104,4


Oracle导入工具:

# coding:GBK

import os
import sys
import time
import cx_Oracle

perread = 1048576 # 1M
# original = 64
# size_list = []
# while original<4000:
# size_list.append(original)
# original = original*2
original = 32
size_list = []
for i in range(125):
size_list.append(original)
original = original + 32
def getSize(aName, aSize):
for i in size_list:
if aSize<i:
return i
raise ValueError("字段太长 '%s' %s"%(aName, aSize))

def loadCSV(fileName, separtor=" "):
print "loadCSV", fileName, "..."
tf = file(fileName, "r")
connOra = cx_Oracle.connect("user", "pass", "192.168.1.110:1521/testdb")
begin = time.time()

mark = 0
count = 0
currLines = tf.readlines(perread)
count = count + len(currLines) - 1
currTable = [i.rstrip().split(separtor) for i in currLines if len(i.rstrip())>0]
columnName = currTable[0]
for i in columnName:
if not i:
raise ValueError("字段名不能为空!")
columnCount = len(columnName)
currTable = fillSpace(currTable, columnCount)
currTable = distinctColumn(currTable)
aName = os.path.split(fileName)[1].split(".")[0]
createSQL, insertSQL = buildCreate(aName, currTable)

cursorOra = connOra.cursor()
print createSQL, ";"
cursorOra.execute(createSQL)
connOra.commit()
print insertSQL, ";"
cursorOra.prepare(insertSQL)
cursorOra.executemany(None, currTable[1:])
connOra.commit()
print "insert", count, "lines."

while tf.tell() > mark:
mark = tf.tell()
currLines = tf.readlines(perread)
if not tf.tell() > mark:
break
count = count + len(currLines)
currTable = [i.rstrip().split(separtor) for i in currLines if len(i.rstrip())>0]
currTable = fillSpace(currTable, columnCount)
cursorOra.executemany(None, currTable)
connOra.commit()
print "insert", count, "lines."

end = time.time()
print "Count:%s Time:%s"%(count, end - begin)
cursorOra.close()
connOra.close()
tf.close()

def fillSpace(aTable, colCount):
for ii, iv in enumerate(aTable):
while len(iv) < colCount:
iv.append("")
if len(iv) != colCount:
raise ValueError("字段不够多!" + str(iv))
return aTable

def distinctColumn(aTable):
columnNames = aTable[0]
ucolumnNames = set(columnNames)
if len(columnNames)==len(ucolumnNames):
return aTable
tcinclude = []
tcexclude = []
for i, v in enumerate(columnNames):
if v in tcinclude:
print "字段竟然重复", v
tcexclude.append(i)
else:
tcinclude.append(v)
result = []
for i in aTable:
result.append([v for j, v in enumerate(i) if j not in tcexclude])
return result

def buildCreate(aName, aTable):
columnNames = aTable[0]
columnSizes = [getSize(k, len(v)) for k, v in zip(aTable[0], aTable[1])]
for i in aTable[2:]:
for j, v in enumerate(i):
if not len(v)<columnSizes[j]:
columnSizes[j] = getSize(columnNames[j], len(v))

createSQL = ["CREATE TABLE " + aName]
createSQL.append("(")
for k, v in zip(columnNames, columnSizes):
currLines = " %s VARCHAR2(%s),"%(k, v)
createSQL.append(currLines)
createSQL[-1] = " %s VARCHAR2(%s)"%(k, v)
createSQL.append(")")
k = ", ".join(columnNames)
v = ", ".join([":%s"%(i+1) for i in range(len(columnNames))])
insertSQL = "INSERT INTO %s (%s) VALUES (%s)"%(aName, k, v)
return ("\n".join(createSQL), insertSQL)

def usage():
print "CSVloader -h"
print "CSVloader -f bsc.csv bts.csv"
print "CSVloader -d dir1 dir2"
print "CSVloader -f bsc.csv bts.csv -s ,"
print "CSVloader -d dir1 dir2 -s ,"
print "CSVloader -s , -f bsc.csv bts.csv"
print "CSVloader -s , -d dir1 dir2"
print " Options include:"
print " -h [help] - 打印帮助"
print " -s [separtor] - 使用的分割符,默认 tab"
print " -f [fileName] - 文件名,可以指定多个"
print " -d [dirName] - 文件夹,可以指定多个"
print
print " 注意:文件必须是 GBK 编码"

if __name__ == "__main__":
if "-h" in sys.argv:
usage()
elif "-d" in sys.argv:
separtor = " "
idxf = sys.argv.index("-d")
if "-s" in sys.argv:
idxs = sys.argv.index("-s")
separtor = sys.argv[idxs+1]
if idxs>idxf:
for i in sys.argv[idxf+1:idxs]:
for j in os.listdir(i):
loadCSV(os.path.join(i, j), separtor)
else:
for i in sys.argv[idxf+1:]:
for j in os.listdir(i):
loadCSV(os.path.join(i, j), separtor)
else:
for i in sys.argv[idxf+1:]:
for j in os.listdir(i):
loadCSV(os.path.join(i, j), separtor)
elif "-f" in sys.argv:
separtor = " "
idxf = sys.argv.index("-f")
if "-s" in sys.argv:
idxs = sys.argv.index("-s")
separtor = sys.argv[idxs+1]
if idxs>idxf:
for i in sys.argv[idxf+1:idxs]:
loadCSV(i, separtor)
else:
for i in sys.argv[idxf+1:]:
loadCSV(i, separtor)
else:
for i in sys.argv[idxf+1:]:
loadCSV(i, separtor)
else:
usage()


(WinXP SP2,Oracle 11下使用通过,Oracle数据库使用GBK编码,csv文件使用GBK编码)


注意一下,小工具拿到文件后,
会以文件名作为表名,开始造表(create),
字段全部为字符串,各个字段长度会根据
第一批读入的数据进行计算(第一批读1M,可以调,防大文件用的),
还要注意一下
MySQL版本与Oracle版本有点区别,
MySQL版本有自动删除表功能,先试着删除一遍,才会建表,导入数据;
Oracle版本没有加删除表功能,只有建表导入数据过程。
(MySQL一般只在自己本本上使用,所以导入,改改文件再导入方便,无需去手动删除表,
Oracle一般会在服务器上导数据用,怕删掉不该删除的东西,所以未加这个功能)
再有MySQL版用UTF8编码,Oracle版用的GBK编码,这个跟数据库的设置有关
UTF8编码与GBK编码转换,可参考使用文本学习笔记3的批量转换工具
当然,文件不多也可以用文本编辑器手动另存转换。













评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值