由于使用cx_oracle连接oracle然后导入数据文件速度太慢(要导的数据有几百G),出于速度的考虑,选择了oracle自带的sql loader工具,使用python脚本写界面,与用户交互,然后使用system()去批量调用sql loader,对于导入失败的bad数据以及日志数据,我用了c++写了一个程序,把bad文件转换成了ok的数据,然后又写了一个脚本处理这些bad数据。
1、python3调用sqlldr工具导入数据的脚本实现。
#!C:\Python34\python.exe
'''
by dairen 20170703
'''
from tkinter import *
import tkinter.messagebox as messagebox
import cx_Oracle
import time
import datetime
import os
class Application(Frame):
def __init__(self, master=None):
Frame.__init__(self, master,height=2000,width=4000)
self.pack()
self.createWidgets()
def createWidgets(self):
self.label_1 = Label(self,text = "文件起始日期:")
self.label_2 = Label(self,text = "文件结束日期:")
self.label_1.grid(row = 0,column = 0)
self.label_2.grid(row = 1,column = 0)
self.Input1 = Entry(self)
self.Input2 = Entry(self)
self.Input1.grid(row = 0,column = 1)
self.Input2.grid(row = 1,column = 1)
self.Button1 = Button(self, text='开始导入',width=20, command=self.runmain)
self.Button1.grid(row =2,columnspan=2)
def runmain(self):
begin_date = self.Input1.get() or '20170607'
end_date = self.Input2.get() or '20170607'
messagebox.showinfo('Tips!', '您的输入是:%s 和 %s \n 导入即将开始!' %(begin_date,end_date))
#offset:358O b_Info 40-60
aaa=[' CHAR(42)',' CHAR(42)',' CHAR(42)',' CHAR(8)',' CHAR(3)',
' CHAR(1)',' CHAR(1)',' CHAR(1)',' CHAR(1)',' CHAR(1)',
' CHAR(6)',' CHAR(10)',' CHAR(11)',' CHAR(11)',' CHAR(11)',
' CHAR(11)',' CHAR(11)',' CHAR(21)',' CHAR(21)',' CHAR(21)',
' CHAR(4)',' CHAR(6)',' CHAR(12)',' CHAR(4)',' CHAR(6)',
' CHAR(4)',' CHAR(3)',' CHAR(2)',' CHAR(12)',' CHAR(6)',
' CHAR(8)',' CHAR(15)',' CHAR(65)',' CHAR(3)',' CHAR(4)',
' CHAR(6)',' CHAR(10)',' CHAR(1)',' CHAR(1)',' CHAR(5)'
,' CHAR(2)',' CHAR(2)',' CHAR(2)',' CHAR(2)',' CHAR(4)'
,' CHAR(4)',' CHAR(11)',' CHAR(11)',' NUMBER(12,0)',' NUMBER(12,0)'
,' NUMBER(12,0)',' NUMBER(12,0)',' CHAR(3)',' CHAR(3)',' NUMBER(8,0)'
,' CHAR(1)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)'
,' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)'
,' NUMBER(8,0)',' CHAR(8)',' CHAR(8)',' CHAR(8)',' NUMBER(8,0)'
,' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)'
,' NUMBER(8,0)',' NUMBER(8,0)',' NUMBER(8,0)',' CHAR(16)',' CHAR(2)'
,' CHAR(1)',' CHAR(2)',' CHAR(14)',' CHAR(4)',' CHAR(3)'
,' CHAR(9)',' NUMBER(12,0)',' CHAR(3)',' CHAR(1)',' CHAR(8)'
,' CHAR(1)',' CHAR(12)',' CHAR(15)',' CHAR(6)',' CHAR(4)'
,' CHAR(100)']
lenlist=[42,42,42,8,3,1,1,1,1,1,
6,10,11,11,11,11,11,21,21,21,
4,6,12,4,6,4,3,2,12,6,
8,15,40,3,4,6,10,1,1,5,
2,2,2,2,4,4,11,11,12,12,
12,12,3,3,8,1,8,8,8,8,
8,8,8,8,8,8,8,8,8,8,
8,8,8,8,8,8,8,8,16,2,
1,2,14,4,3,9,12,3,1,8,
1,12,15,6,4,100]
title=[
'Main_Key','Origin_Key','Er_Key','PostDate','Curr',
'If_Curr','If_Cen','Cups_Sett','If_Cvm_In','If_Single',
'Ssn_Trans','Time_Trans','Ob_Code','Trans_Code','Rec_Code',
'Cb_Code','Cvm_Code','Account11','Account_In','Account_Out',
'MsgType','ProcCode','TranAmt','ActDate','ActTime',
'MerChant_Type','Sevr_Point_In','Sevr_Point_Con','Search','AuthCode',
'Term','Ob_Id','Ob_Info','Curr_Tran','Reason_Code',
'Ssn_Trans_Ori','Time_Trans_Ori','Status_Snd','Status_Rec','Status_Trans',
'RejCode1','RejCode2','RejCode3','RejCode4','Sett_Snd_AreaCode',
'Sett_Rec_AreaCode','Sett_Snd_Code','Sett_Rec_Code' ,'Amt_Snd_Sett_dr','Amt_Snd_Sett_cr',
'Amt_Rec_Sett_dr','Amt_Rec_Sett_cr','Curr_Sett_Snd','Curr_Sett_Rec','Comm',
'Comm_Way','Comm_Snd_dr','Comm_Snd_cr','Comm_Rec_dr','Comm_Rec_cr',
'Fee_Snd_dr','Fee_Snd_cr','Fee_Rec_dr','Fee_Rec_cr','Comm_Cen_dr',
'Comm_Cen_cr','Fee_Snd_Branch_dr','Fee_Snd_Branch_cr','Fee_Rec_Branch_dr','Fee_Rec_Branch_cr',
'Fee_CanCel_Snd_dr','Fee_CanCel_Snd_cr','Fee_CanCel_Rec_dr','Fee_CanCel_Rec_cr','Fee_CanCel_Cen_dr',
'Fee_CanCel_Cen_cr','Fee_Cl_Snd_Branch_dr','Fee_Cl_Snd_Branch_cr','Fee_Cl_Rec_Branch_dr','Fee_Cl_Rec_Branch_cr',
'TranType','CardKind','CardType','Bin','Card',
'Er_Max','Ssn_Cen','TranAmt_Ori','TranCode_Ori','Reserved',
'variab1','variab2','variab3','variab4','variab5','variab6'
]
x=len(title)
fields=[]
fields = [title[i]+aaa[i] for i in range(0,x)]
fields_str = ', '.join(fields)
#begin_date=input("input start date(格式:20170506):")
#end_date=input("input end date(格式:20170606):")
date_list = []
begin_date = datetime.datetime.strptime(begin_date, "%Y%m%d")
end_date = datetime.datetime.strptime(end_date, "%Y%m%d")
if (begin_date > end_date):
messagebox.showinfo('error!', 'begin_date or end_date error!)')
while( begin_date <= end_date):
date_str = begin_date.strftime("%Y%m%d")
date_list.append(date_str)
begin_date += datetime.timedelta(days=1)
#print(date_list)
file_name_list=[]
numlist=['01','02','03','04']
for datename in date_list:
for num in numlist:
bsi_str='BSI%s01_%s'%(datename,num)
file_name_list.append(bsi_str)
for num in numlist:
bsi_str='BSI%s51_%s'%(datename,num)
file_name_list.append(bsi_str)
file_name_list.append('BSI%s88_01'%datename)
file_name_list.append('BSI%s99_01'%datename)
for num in numlist:
bsa_str='BSA%s01_%s'%(datename,num)
file_name_list.append(bsa_str)
for num in numlist:
bsa_str='BSA%s51_%s'%(datename,num)
file_name_list.append(bsa_str)
file_name_list.append('BSA%s88_01'%datename)
file_name_list.append('BSA%s99_01'%datename)
#print(file_name_list)
table_list=[]
for datename in date_list:
bsi_str='BSI_%s'%datename
table_list.append(bsi_str)
bsa_str='BSA_%s'%datename
table_list.append(bsa_str)
#print(table_list)
#开始计时
starttime=time.clock()
try:
conn = cx_Oracle.connect('xxx/xxx@xxx/orcl')
#print("connect oracle success!!!")
except:
#print("connect oracle failed!!!")
messagebox.showinfo('warning!', 'failed connect oracle!')
return
cursor = conn.cursor()
data_num=len(date_list)
file_num=len(file_name_list)
table_num=len(table_list)
#create table
for k in range(0,table_num):
table_name= table_list[k]
#print("当前表名:%s "%table_name)
sql_createtb = "create table %s (%s)" % (table_name, fields_str)
sql_droptb='drop table %s purge'%table_name
#print(sql_createtb)
#print(sql_droptb)
try:
#print("开始建表!")
cursor.execute(sql_createtb)
#print("建表成功!")
except:
#print("表已存在!开始删表!")
cursor.execute(sql_droptb)
#print("删表成功!开始重新建表!")
cursor.execute(sql_createtb)
#print("重新建表成功!")
conn.commit()
cursor.close()
conn.close()
#execute cmd according to fiel_name
succ=0
errornum=0
for i in range(0,file_num):
try:
file_name=file_name_list[i]
if os.path.isfile(file_name):
table_name=table_list[i//10]
#CHARACTERSET ZHS16GBK
#edit ctl files according to table_name
lines=open('bsa.ctl','r').readlines()
for k in range(len(lines)):
if 'INTO TABLE' in lines[k]:
lines[k]='INTO TABLE %s\n'%table_name
f=open('bsa.ctl','w')
f.writelines(lines)
f.close()
basedir=os.getcwd()
newname=file_name+".dat"
os.rename(file_name,newname)
bad_dir='%s\\bad_files'%basedir
log_dir='%s\\log_files'%basedir
if not os.path.exists(bad_dir):
os.mkdir(bad_dir)
if not os.path.exists(log_dir):
os.mkdir(log_dir)
#import
cmd_str="sqlldr xxx/xxx@xxx/orcl control=%s\\bsa.ctl data=%s "%(basedir,newname)
cmd_str+="bad=%s\\bad_files\\%s.bad log=%s\\log_files\\%s.log direct=true"%(basedir,file_name,basedir,file_name)
#print(cmd_str)
os.system(cmd_str)
succ+=1
#process bad file
except:
errornum+=1
endtime=time.clock()
messagebox.showinfo('完成!!','耗时约:%f秒,success in %d files'%((endtime-starttime),succ))
app = Application()
# 设置窗口标题:
app.master.title('CUPS IMPORTER')
# 主消息循环:
app.mainloop()
2、sqlldr规则文件的编写.
sqlldr读取文件有好几种方式,我要处理的源文件是字节流的,所以使用了按字节分隔的控制文件,如下:
options(ERRORS=20000)
load DATA
APPEND
INTO TABLE BSA_20170607
(
Main_Key POSITION(01:42) CHAR,
Origin_Key POSITION(43:84) CHAR,
Er_Key POSITION(85:126) CHAR,
PostDate POSITION(127:134) CHAR,
Curr POSITION(135:137) CHAR,
If_Curr POSITION(138:138) CHAR,
If_Cen POSITION(139:139) CHAR,
Cups_Sett POSITION(140:140) CHAR,
If_Cvm_In POSITION(141:141) CHAR,
If_Single POSITION(142:142) CHAR,
Ssn_Trans POSITION(143:148) CHAR,
Time_Trans POSITION(149:158) CHAR,
Ob_Code POSITION(159:169) CHAR,
Trans_Code POSITION(170:180) CHAR,
Rec_Code POSITION(181:191) CHAR,
Cb_Code POSITION(192:202) CHAR,
Cvm_Code POSITION(203:213) CHAR,
Account11 POSITION(214:234) CHAR,
Account_In POSITION(235:255) CHAR,
Account_Out POSITION(256:276) CHAR,
MsgType POSITION(277:280) CHAR,
ProcCode POSITION(281:286) CHAR,
TranAmt POSITION(287:298) CHAR,
ActDate POSITION(299:302) CHAR,
ActTime POSITION(303:308) CHAR,
MerChant_Type POSITION(309:312) CHAR,
Sevr_Point_In POSITION(313:315) CHAR,
Sevr_Point_Con POSITION(316:317) CHAR,
Search POSITION(318:329) CHAR,
AuthCode POSITION(330:335) CHAR,
Term POSITION(336:343) CHAR,
Ob_Id POSITION(344:358) CHAR,
Ob_Info POSITION(359:398) CHAR,
Curr_Tran POSITION(399:401) CHAR,
Reason_Code POSITION(402:405) CHAR,
Ssn_Trans_Ori POSITION(406:411) CHAR,
Time_Trans_Ori POSITION(412:421) CHAR,
Status_Snd POSITION(422:422) CHAR,
Status_Rec POSITION(423:423) CHAR,
Status_Trans POSITION(424:428) CHAR,
RejCode1 POSITION(429:430) CHAR,
RejCode2 POSITION(431:432) CHAR,
RejCode3 POSITION(433:434) CHAR,
RejCode4 POSITION(435:436) CHAR,
Sett_Snd_AreaCode POSITION(437:440) CHAR,
Sett_Rec_AreaCode POSITION(441:444) CHAR,
Sett_Snd_Code POSITION(445:455) CHAR,
Sett_Rec_Code POSITION(456:466) CHAR,
Amt_Snd_Sett_dr POSITION(467:478),
Amt_Snd_Sett_cr POSITION(479:490),
Amt_Rec_Sett_dr POSITION(491:502),
Amt_Rec_Sett_cr POSITION(503:514),
Curr_Sett_Snd POSITION(515:517) CHAR,
Curr_Sett_Rec POSITION(518:520) CHAR,
Comm POSITION(521:528) ,
Comm_Way POSITION(529:529) CHAR,
Comm_Snd_dr POSITION(530:537) ,
Comm_Snd_cr POSITION(538:545) ,
Comm_Rec_dr POSITION(546:553) ,
Comm_Rec_cr POSITION(554:561) ,
Fee_Snd_dr POSITION(562:569) ,
Fee_Snd_cr POSITION(570:577) ,
Fee_Rec_dr POSITION(578:585) ,
Fee_Rec_cr POSITION(586:593) ,
Comm_Cen_dr POSITION(594:601) ,
Comm_Cen_cr POSITION(602:609) ,
Fee_Snd_Branch_dr POSITION(610:617) CHAR,
Fee_Snd_Branch_cr POSITION(618:625) CHAR,
Fee_Rec_Branch_dr POSITION(626:633) CHAR,
Fee_Rec_Branch_cr POSITION(634:641) ,
Fee_CanCel_Snd_dr POSITION(642:649) ,
Fee_CanCel_Snd_cr POSITION(650:657) ,
Fee_CanCel_Rec_dr POSITION(658:665) ,
Fee_CanCel_Rec_cr POSITION(666:673) ,
Fee_CanCel_Cen_dr POSITION(674:681) ,
Fee_CanCel_Cen_cr POSITION(682:689) ,
Fee_Cl_Snd_Branch_dr POSITION(690:697) ,
Fee_Cl_Snd_Branch_cr POSITION(698:705) ,
Fee_Cl_Rec_Branch_dr POSITION(706:721) CHAR,
Fee_Cl_Rec_Branch_cr POSITION(722:723) CHAR,
TranType POSITION(724:724) CHAR,
CardKind POSITION(725:726) CHAR,
CardType POSITION(727:740) CHAR,
Bin POSITION(741:744) CHAR,
Card POSITION(745:747) CHAR,
Er_Max POSITION(748:756) CHAR,
Ssn_Cen POSITION(757:768) ,
TranAmt_Ori POSITION(769:771) CHAR,
TranCode_Ori POSITION(772:772) CHAR,
Reserved POSITION(773:780) CHAR,
variab1 POSITION(781:781) CHAR,
variab2 POSITION(782:793) CHAR,
variab3 POSITION(794:808) CHAR,
variab4 POSITION(809:814) CHAR,
variab5 POSITION(815:818) CHAR,
variab6 POSITION(819:918) CHAR
)
其中,我每次要针对不同的文件,把文件数据插入到不同的表,我都会读取ctl文件,修改其中的表名,然后再调用sqlldr命令。
3、c++处理sqlldr生成的bad文件,使之变成可以再次导入的文件.
头文件和cpp文件如下:
先是从网上找到的一个把gbk编码的字符数组转换成unicode编码的简单类:
// UtfFile.h: interface for the UtfFile class.
///////////////////////////////////////////////////////////
#if !defined(AFX_UTFFILE_H__8ED10D8A_D1A3_412F_A600_124F521CE4F1__INCLUDED_)
#define AFX_UTFFILE_H__8ED10D8A_D1A3_412F_A600_124F521CE4F1__INCLUDED_
#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000
#include <windows.h>
#include <stdio.h>
#include <locale.h>
#include <IOSTREAM>
#include <FSTREAM>
using namespace std;
class UtfFile
{
public:
static char * UnicodeToGB2312(WCHAR uData, char buffer[2]); // Unicode 转换成 GB2312
static WCHAR * UTF_8ToUnicode(char *pText, WCHAR &unicode); // 把UTF-8 转化成 Unicode
static char * TranslateUTF8ToGB(char *str, size_t len); // 把UTF-8字符串转化成ANSI(GB2312)编码形式
char * GetString(char *str, int maxLen = 1024); // 读取一个字符串,以换行符为结束标示
char * GetLine(char *str, int maxLen); // 读取一行字符
void close(); // 关闭文件流
int open(const char *sFileName); // 用于打开一个文件
UtfFile(const char *sour);
virtual ~UtfFile();
public:
ifstream inf;
};
#endif // !defined(AFX_UTFFILE_H__8ED10D8A_D1A3_412F_A600_124F521CE4F1__INCLUDED_)
该类的源文件:
/*-----------------------------------------------以下为.cpp文件内容------------------------------------------*/
// UtfFile.cpp: implementation of the UtfFile class.
//
//////////////////////////////////////////////////////////////////////
#include "UtfFile.h"
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
UtfFile::UtfFile(const char *sour)
:inf(sour)
{
if( !inf.is_open() )
{
char str[1024];
sprintf(str, "错误的文件路径,文件无法打开:%s", sour);
throw runtime_error(str);
}
}
UtfFile::~UtfFile()
{
}
void UtfFile::close()
{
inf.close();
}
char * UtfFile::GetLine(char *str, int maxLen)
{
if ( inf.eof() )
{
str[0] = '\0';
}
else
{
inf.getline(str, maxLen);
TranslateUTF8ToGB(str, maxLen);
}
return str;
}
char * UtfFile::GetString(char *str, int maxLen)
{
if ( inf.eof() )
{
str[0] = '\0';
}
else
{
inf >> str;
TranslateUTF8ToGB(str, maxLen);
}
return str;
}
char * UtfFile::UnicodeToGB2312(WCHAR uData, char buffer[2])
{
WideCharToMultiByte(CP_ACP,NULL, &uData, 1,buffer,sizeof(WCHAR),NULL,NULL);
return buffer;
}
WCHAR * UtfFile::UTF_8ToUnicode(char *pText, WCHAR &unicode)
{
/* http://blog.youkuaiyun.com/liuzhiyuan1982/article/details/3911150
UTF-8是一种多字节编码的字符集,表示一个Unicode字符时,它可以是1个至多个字节,在表示上有规律:
1字节:0xxxxxxx
2字节:110xxxxx 10xxxxxx
3字节:1110xxxx 10xxxxxx 10xxxxxx
4字节:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
char *uchar = (char *)&unicode;
uchar[1] = ((pText[0] & 0x0F) << 4) + ((pText[1] >> 2) & 0x0F);
uchar[0] = ((pText[1] & 0x03) << 6) + (pText[2] & 0x3F);
return &unicode;
}
char * UtfFile::TranslateUTF8ToGB(char *str, size_t len)
{
char * newCharBuffer = new char[len];
int index =0;
int nCBIndex = 0;
WCHAR wTemp = 0;
char cTemp[2] = " ";
while(index < len)
{
if ( str[index] == 0 )
break;
else if(str[index] > 0) // 如果是GB2312的字符
{
newCharBuffer[nCBIndex] = str[index]; //直接复制
index += 1; //源字符串偏移量1
nCBIndex += 1; //目标字符串偏移量1
}
else //如果是UTF-8的字符
{
UTF_8ToUnicode(str + index, wTemp); //先把UTF-8转成Unicode
UnicodeToGB2312(wTemp, &newCharBuffer[nCBIndex]); //再把Unicode 转成 GB2312
index += 3; //源字符串偏移量3
nCBIndex += 2; //目标字符串偏移量2 因为一个中文UTF-8占3个字节,GB2312占两个字节
}
}
newCharBuffer[nCBIndex] = '\0'; //结束符
strcpy( str, newCharBuffer );
delete newCharBuffer; //避免内存泄漏,这是对源代码的稍许修改
newCharBuffer = NULL;
return str;
}
int UtfFile::open(const char *sFileName)
{
inf.open(sFileName);
return inf.is_open();
}
下面是我完成的main文件,大致就是对要处理的文件,读出一行,转码,替换里面的半中文字符‘?’,然后写入要输出的文件:
#include "UtfFile.h"
#include<string>
#include<vector>
#include<io.h>
#include<cstdio>
using namespace std;
void getFiles(string path, vector<string>& files)
{
//文件句柄
intptr_t hFile = 0;
//文件信息
struct _finddata_t fileinfo;
string p;
if ((hFile = _findfirst(p.assign(path).append("\\*").c_str(), &fileinfo)) != -1)
{
do
{
//如果是目录,迭代之
//如果不是,加入列表
if ((fileinfo.attrib & _A_SUBDIR))
{
if (strcmp(fileinfo.name, ".") != 0 && strcmp(fileinfo.name, "..") != 0)
getFiles(p.assign(path).append("\\").append(fileinfo.name), files);
}
else
{
files.push_back(p.assign(path).append("\\").append(fileinfo.name));
}
} while (_findnext(hFile, &fileinfo) == 0);
_findclose(hFile);
}
}
char* G2U(const char* gb2312)
{
int len = MultiByteToWideChar(CP_ACP, 0, gb2312, -1, NULL, 0);
wchar_t* wstr = new wchar_t[len + 1];
memset(wstr, 0, len + 1);
MultiByteToWideChar(CP_ACP, 0, gb2312, -1, wstr, len);
len = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, NULL, 0, NULL, NULL);
char* str = new char[len + 1];
memset(str, 0, len + 1);
WideCharToMultiByte(CP_UTF8, 0, wstr, -1, str, len, NULL, NULL);
if (wstr) delete[] wstr;
return str;
}
void do_bad(string in_file, string out_file)
{
UtfFile myfile(in_file.c_str()); //读bad文件
ofstream outfile;
outfile.open(out_file, ios::trunc); //写文件
string tempstr;
char* mystr;
while (std::getline(myfile.inf, tempstr))
{
mystr = const_cast<char*>(tempstr.c_str());
mystr = G2U(mystr);
UtfFile::TranslateUTF8ToGB(mystr, 918);
char* str_bk = mystr;
for (int j = 0; j<918; j++)//可用find函数优化
{
if (*mystr == '?') //replace
{
*mystr = '1';
break;
}
mystr++;
}
outfile << str_bk << '\n';//
}
outfile.close();
myfile.close();
}
int main()
{
//char strbuffer[1024]={0};
//mystr=strbuffer;
//
char * filePath = "C:\\pythonfile\\sqlldr_bsibsa\\bad_process";
//\\bad_files
vector<string> files;
vector<string> in_files;
vector<string> out_files;
获取该路径下的所有文件
getFiles(filePath, files);
//BSA2017060701_01.bad
int total_size = files.size();
for (int i = 0; i < total_size; i++)
{
//cout << files[i] << endl;
int pos = files[i].find_last_of('\\');
string name_with_exp(files[i].substr(pos + 1));
string base_dir(files[i].substr(0,pos+1));
cout << name_with_exp.size() << endl;
if (name_with_exp.size() == 20)
{
in_files.push_back(files[i]);
//cout << s << endl;
string name_only = name_with_exp.substr(0, 16);
// char str[50];
cout << name_only << endl;
// sprintf(str,"%s.bad",name_only);
// cout << str << endl;
string full_path;
full_path = base_dir + name_only + ".dat";
//cout << full_path << endl;
out_files.push_back(full_path);
}
}
int input_size = in_files.size();
for (int i = 0; i < input_size; i++)
{
do_bad(in_files[i], out_files[i]);
}
//C:\\pythonfile\\sql_loader\\bad_test\\BSA2016010101.dat
//C:\\pythonfile\\sql_loader\\bad_test\\BSA2016010101.bad
//
//UtfFile myfile("C:\\pythonfile\\sql_loader\\bad_test\\1.dat"); //读bad文件
return 0;
}
4、c++程序处理好bad数据使之可以用之后,我又写了一个脚本,和专门的ctl文件,再次把这些bad文件批量导进去:
import os
basedir=os.getcwd()
#print(basedir)
files_list=os.listdir(basedir)
bad_dir='%s\\badd_files'%basedir
log_dir='%s\\logg_files'%basedir
if not os.path.exists(bad_dir):
os.mkdir(bad_dir)
if not os.path.exists(log_dir):
os.mkdir(log_dir)
succ=0
for file_name in files_list:
if len(file_name) != 20:
continue
if ('BS' in file_name) and ('dat' in file_name):
#print(file_name)
table_name=file_name[0:3]+'_'+file_name[3:11]
#print(table_name)
#newname = file_name[:-4] + '.dat'
#os.rename(file_name,newname)
lines=open('bsa_bad.ctl','r').readlines()
for k in range(len(lines)):
if 'INTO TABLE' in lines[k]:
lines[k]='INTO TABLE %s\n'%table_name
f=open('bsa_bad.ctl','w')
f.writelines(lines)
f.close()
file_base=file_name[0:16]
#print(file_base)
#import
cmd_str="sqlldr xxx/xxx@xxx/orcl control=%s\\bsa_bad.ctl data=%s "%(basedir,file_name)
cmd_str+="bad=%s\\badd_files\\%s.bad log=%s\\logg_files\\%s.log direct=true"%(basedir,file_base,basedir,file_base)
#print(cmd_str)
os.system(cmd_str)
succ+=1
else:
continue
print("succ:%d"%succ)
这个文件又生成了极小的bad文件,我分析了下,都是个别在源文件就是乱码的数据,可以分门别类留存下来,不必理会。
注意:其中oracle登陆远程服务器的用户名,密码,ip地址都被我用xxx代替了
欢迎提问!