2019年4月19号下午查看相关日志,发现磁盘报错在日志中记录频繁,自我盘查无果之后,当天汇报给公司后商量后并通过raid的相关命令查询,确定第五台存储的2号盘磁盘坏道,初步给出的结论是磁盘坏道,但是不影响使用,后期建议换盘,并将相关信息反馈给客户,但是事后两天,突然一台存储raid卡故障,导致磁盘只读不可写,磁盘被36内小时drop,因为做了normal冗余,两副本都坏,所以导致数据丢失。以下为恢复的过程,恢复都是在晚上进行的。历时5天,数据库不停机。心痛啊。但是对于我这个刚毕业一年的人来说,确实学到了很多东西。。。。。但是代价还是蛮大的ing
-
坏块的查找
- 使用rman方式扫描全库找出逻辑坏块此run块会扫描数据库所有存在的物理坏块以及逻辑坏块,并保存到v$database_block_corruption这个视图当中
run{
ALLOCATE CHANNEL c1 DEVICE TYPE DISK;
ALLOCATE CHANNEL c2 DEVICE TYPE DISK;
ALLOCATE CHANNEL c3 DEVICE TYPE DISK;
ALLOCATE CHANNEL c4 DEVICE TYPE DISK;
ALLOCATE CHANNEL c5 DEVICE TYPE DISK;
ALLOCATE CHANNEL c6 DEVICE TYPE DISK;
ALLOCATE CHANNEL c7 DEVICE TYPE DISK;
ALLOCATE CHANNEL c8 DEVICE TYPE DISK;
ALLOCATE CHANNEL c9 DEVICE TYPE DISK;
ALLOCATE CHANNEL c10 DEVICE TYPE DISK;
ALLOCATE CHANNEL c11 DEVICE TYPE DISK;
ALLOCATE CHANNEL c12 DEVICE TYPE DISK;
VALIDATE DATABASE CHECK LOGICAL;
}
- 查看坏块所在的对象,以及文件号,块号。
select * from v$database_block_curruption;
- 查看坏块所属的对象,以及用户。
select 'select owner,segment_name,partition_name from dba_extents where file_id='||file_id#||' and '||blcok#||'' and block_id and block_id+blocks -1 ;'from v$database_block_corrupion
-
根据以上的查询再查询有坏块的表是否为分区表
select * from dba_tables where table_name in ('SRV_ORDER_DETAIL','SRV_INTERFACE_LOG_RSS','BILLING_LOG','RME_PORT','ADDR_SEGM_GS')
and owner='RESGS'
- 查询分区表坏块所在的分区
select 'SELECT /*+parallel(10)*/COUNT(*) from RESGS.SRV_ORDER_DETAIL partition('||partition_name||');'
from dba_tab_partitions where table_name in ('SRV_ORDER_DETAIL') and table_owner='RESGS'
- 由于此次没有备份,并且备份的数据不可用,所以使用cas方式进行不完全恢复。第一步:保留以上表的统计信息,使用包DBMS_STATS.CREATE_STAT_TABLE
BEGIN
DBMS_STATS.CREATE_STAT_TABLE(
'RESGS',
'STAT_TMP'
);
END;
/
- 保留坏块表的统计信息
BEGIN
DBMS_STATS.EXPORT_TABLE_STATS(OWNNAME =>'RESGS',TABNAME => 'SRV_ORDER_DETAIL',STATTAB => 'STAT_TMP');
DBMS_STATS.EXPORT_TABLE_STATS(OWNNAME =>'RESGS',TABNAME => 'SRV_INTERFACE_LOG_RSS',STATTAB => 'STAT_TMP');
DBMS_STATS.EXPORT_TABLE_STATS(OWNNAME =>'RESGS',TABNAME => 'RME_PORT',STATTAB => 'STAT_TMP');
DBMS_STATS.EXPORT_TABLE_STATS(OWNNAME =>'RESGS',TABNAME => 'ADDR_SEGM_GS',STATTAB => 'STAT_TMP');
DBMS_STATS.EXPORT_TABLE_STATS(OWNNAME =>'RESGS',TABNAME => 'BILLING_LOG',STATTAB => 'STAT_TMP');
END;
/
- 查看相关表的索引,并将统计信息保留。
查看坏块表的索引
select owner,
table_name,
index_name,
index_type,
partitioned,
status,
tablespace_name,
degree
from dba_indexes
where table_name in ('SRV_ORDER_DETAIL',
'SRV_INTERFACE_LOG_RSS',
'RME_PORT',
'ADDR_SEGM_GS',
'BILLING_LOG')
order by owner, table_name, index_name;
保留索引的统计信息
BEGIN
DBMS_STATS.EXPORT_INDEX_STATS(OWNNAME =>'RESGS',INDNAME => 'PK_T_TEST',STATTAB => 'STAT_TMP');
DBMS_STATS.EXPORT_INDEX_STATS(OWNNAME =>'RESGS',INDNAME => 'T_TEST_LOCAL',STATTAB => 'STAT_TMP');
END;
/
- 创建备份表,没有数据。
create table SRV_ORDER_DETAIL_plz_bak as select * from SRV_ORDER_DETAIL PARTITION(P_LZ) where 0=1;
create table SRV_INTERFACE_LOG_RSS_psr_bak as select * from SRV_INTERFACE_LOG_RSS PARTITION(P_SRV_INTERF_LOG_RSS_201904) where 0=1;
create table RME_PORT_ppl_bak as select * from RME_PORT PARTITION(P_PL) where 0=1;
create table RME_PORT_pts_bak as select * from RME_PORT PARTITION(P_TS) where 0=1;
create table ADDR_SEGM_GS_bak as select * from ADDR_SEGM_GS where 0=1;
create table BILLING_LOG_bak as select * from BILLING_LOG where 0=1;
-
跳过坏块,把好的数据插入到备份的表中
标记和跳过坏块(用session级别)
alter session set events ='10231 trace name context forever,level 10';
分区表:
alter session enable parallel dml;
insert /*+APPEND*/ into SRV_ORDER_DETAIL_plz_bak select /*+PARALLEL(48)*/* from SRV_ORDER_DETAIL PARTITION(P_LZ);
commit;
alter session enable parallel dml;
insert /*+APPEND*/ into SRV_INTERFACE_LOG_RSS_psr_bak select /*+PARALLEL(48)*/* from SRV_INTERFACE_LOG_RSS PARTITION(P_SRV_INTERF_LOG_RSS_201904);
commit;
alter session enable parallel dml;
insert /*+APPEND*/ into RME_PORT_ppl_bak select /*+PARALLEL(48)*/* from RME_PORT PARTITION(P_PL);
commit;
alter session enable parallel dml;
insert /*+APPEND*/ into RME_PORT_pts_bak select /*+PARALLEL(48)*/* from RME_PORT PARTITION(P_TS);
commit;
SQL> insert /*+APPEND*/ into RESGS.SRV_INTERFACE_LOG_RSS_psr_0425 select /*+PARALLEL(24)*/* from RESGS.SRV_INTERFACE_LOG_RSS PARTITION(P_SRV_INTERF_LOG_RSS_201904);
insert /*+APPEND*/ into RESGS.SRV_INTERFACE_LOG_RSS_psr_0425 select /*+PARALLEL(24)*/* from RESGS.SRV_INTERFACE_LOG_RSS PARTITION(P_SRV_INTERF_LOG_RSS_201904)
ORA-12801: 并行查询服务器 P00E, instance gsdx-qdrac01:resdb1 (1) 中发出错误信号
ORA-01578: ORACLE 数据块损坏 (文件号 173, 块号 2028684)
ORA-01110: 数据文件 173: '+SASDG2/resdb/datafile/woqu1/general_170.dbf'
普通表:
alter session enable parallel dml;
insert /*+APPEND*/ into ADDR_SEGM_GS_bak select /*+PARALLEL(48)*/* from ADDR_SEGM_GS;
commit;
alter session enable parallel dml;
insert /*+APPEND*/ into BILLING_LOG_bak select /*+PARALLEL(48)*/* from BILLING_LOG;
commit;
- 将坏块表中的数据truncate。
坏块表中数据truncate掉
分区表:
alter table SRV_ORDER_DETAIL truncate PARTITION(P_LZ);
alter table SRV_INTERFACE_LOG_RSS truncate PARTITION(P_SRV_INTERF_LOG_RSS_201904);
alter table RME_PORT truncate PARTITION(P_PL);
alter table RME_PORT truncate PARTITION(P_TS);
普通表:
truncate table ADDR_SEGM_GS;
truncate table BILLING_LOG;
- 索引的处理
查看索引
select owner,
table_name,
index_name,
index_type,
partitioned,
status,
tablespace_name,
degree
from dba_indexes
where table_name in ('SRV_ORDER_DETAIL',
'SRV_INTERFACE_LOG_RSS',
'RME_PORT',
'ADDR_SEGM_GS',
'BILLING_LOG')
order by owner, table_name, index_name;
失效索引进行rebuild
全局索引rebuild:
alter index pk_t_test rebuild tablespace [xxx] parallel 32 online;
本地分区索引rebuild:
alter index t_test_local rebuild partition t_test_2013 tablespace [xxx] parallel 32 online;
- 备份表中的数据插入原表
alter session enable parallel dml;
insert into SRV_ORDER_DETAIL select /*+PARALLEL(48)*/* from SRV_ORDER_DETAIL_plz_bak;
commit;
alter session enable parallel dml;
insert into SRV_INTERFACE_LOG_RSS select /*+PARALLEL(48)*/* from SRV_INTERFACE_LOG_RSS_psr_bak;
commit;
alter session enable parallel dml;
insert into RME_PORT select /*+PARALLEL(48)*/* from RME_PORT_ppl_bak;
commit;
alter session enable parallel dml;
insert into RME_PORT select /*+PARALLEL(48)*/* from RME_PORT_pts_bak;
commit;
insert into ADDR_SEGM_GS select /*+PARALLEL(48)*/* from ADDR_SEGM_GS_bak;
commit;
alter session enable parallel dml;
insert into BILLING_LOG select /*+PARALLEL(48)*/* from BILLING_LOG_bak;
commit;
- 还原统计信息
BEGIN
DBMS_STATS.DELETE_TABLE_STATS(OWNNAME => 'RESGS', TABNAME => 'SRV_ORDER_DETAIL');
DBMS_STATS.DELETE_TABLE_STATS(OWNNAME => 'RESGS', TABNAME => 'SRV_INTERFACE_LOG_RSS');
DBMS_STATS.DELETE_TABLE_STATS(OWNNAME => 'RESGS', TABNAME => 'RME_PORT');
DBMS_STATS.DELETE_TABLE_STATS(OWNNAME => 'RESGS', TABNAME => 'ADDR_SEGM_GS');
DBMS_STATS.DELETE_TABLE_STATS(OWNNAME => 'RESGS', TABNAME => 'BILLING_LOG');
END;
/
- 导入备份的统计信息到系统中
BEGIN
DBMS_STATS.IMPORT_TABLE_STATS(OWNNAME => 'RESGS', TABNAME => 'SRV_ORDER_DETAIL', STATTAB => 'STAT_TMP');
DBMS_STATS.IMPORT_TABLE_STATS(OWNNAME => 'RESGS', TABNAME => 'SRV_INTERFACE_LOG_RSS', STATTAB => 'STAT_TMP');
DBMS_STATS.IMPORT_TABLE_STATS(OWNNAME => 'RESGS', TABNAME => 'RME_PORT', STATTAB => 'STAT_TMP');
DBMS_STATS.IMPORT_TABLE_STATS(OWNNAME => 'RESGS', TABNAME => 'ADDR_SEGM_GS', STATTAB => 'STAT_TMP');
DBMS_STATS.IMPORT_TABLE_STATS(OWNNAME => 'RESGS', TABNAME => 'BILLING_LOG', STATTAB => 'STAT_TMP');
END;
/
- 因为存在分区表。但是在使用10231事件跳过坏块所在的分区的时候检测到坏块无法跳过。这就导值了此分区中好的数据无法取出来,因为考虑到前端业务的使用,只能重新创建了业务相关的表。之后再通过cas方式,将好的分区插入到新表中。剩下两坏块所在的分区打算通过rowid分片的方式找回,再这里说明一下,本来打算使用全库恢复的,但是备份不可用,可用的备份在3天前,rowdi分片找的方式就是用备份搭建备库,使用3天前全量备份的库找回坏的分区中的数据。rowid分片找回数据的操作如下。
- 思路如下
select object_name,subobject_name,data_object_id from dba_objects where object_name='SRV_ORDER_DETAIL' and subobject_name='P_LZ';
select object_name,subobject_name,data_object_id from dba_objects where object_name='RME_PORT' and subobject_name='P_TS';
select object_name,subobject_name,data_object_id from dba_objects where object_name='RME_PORT' and subobject_name='P_PL';
select object_name,subobject_name,data_object_id from dba_objects where object_name='SRV_INTERFACE_LOG_RSS' and subobject_name='P_SRV_INTERF_LOG_RSS_201904';
以上是通过查找相关坏块中所在的段ID值,后面的rowid要用到。
----------------------------------------------------------------------------------------------------
select rowid,dbms_rowid.rowid_object(rowid)object_id,
dbms_rowid.rowid_relative_fno(rowid) file_id,
dbms_rowid.rowid_block_number(rowid) block_id,
dbms_rowid.rowid_row_number(rowid) row_id
from test0428;
------------------------------------------------------------------------------------------------------
SRV_ORDER_DETAIL
object_id
file_id 165 ACl
block_id 3648391 AAN6uH
select rowid from SRV_ORDER_DETAIL where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='ACl' and substr(rowid,10,6)='AAN6uH';
create table SRV_ORDER_DETAIL_TMP0428 as select * from SRV_ORDER_DETAIL where rowid in (
select rowid from SRV_ORDER_DETAIL where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='ACl' and substr(rowid,10,6)='AAN6uH';
)
----------------------------------------
RME_PORT
object_id
file_id 45 AAt
block_id 871209 AADUsp
select rowid from RME_PORT where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='AAt' and substr(rowid,10,6)='AADUsp';
create table RME_PORT_TMP0428 as select * from RME_PORT where rowid in (
select rowid from RME_PORT where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='AAt' and substr(rowid,10,6)='AADUsp';
)
RME_PORT
object_id
file_id 43 AAr
block_id 352117 AABV91
select rowid from RME_PORT where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='AAr' and substr(rowid,10,6)='AABV91';
insert into table RME_PORT_TMP0428 as select * from RME_PORT where rowid in (
select rowid from RME_PORT where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='AAr' and substr(rowid,10,6)='AABV91';
)
----------------------------------------
SRV_INTERFACE_LOG_RSS
object_id
file_id 183 AC3
block_id 2827911 AAKyaH
select rowid from SRV_INTERFACE_LOG_RSS where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='ACl' and substr(rowid,10,6)='AAN6uH';
create table SRV_INTERFACE_LOG_RSS_TMP0428 as select * from SRV_INTERFACE_LOG_RSS where rowid in (
select rowid from SRV_INTERFACE_LOG_RSS where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='AAt' and substr(rowid,10,6)='AADUsp';
)
SRV_INTERFACE_LOG_RSS
object_id
file_id 242 ADy
block_id 3036173 AALlQN
select rowid from SRV_INTERFACE_LOG_RSS where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='ADy' and substr(rowid,10,6)='AALlQN';
insert into table SRV_INTERFACE_LOG_RSS_TMP0428 as select * from SRV_INTERFACE_LOG_RSS where rowid in (
select rowid from SRV_INTERFACE_LOG_RSS where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='ADy' and substr(rowid,10,6)='AALlQN';
)
SRV_INTERFACE_LOG_RSS
object_id
file_id 173 ACt
block_id 2028684 AAHvSM
select rowid from SRV_INTERFACE_LOG_RSS where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='ACt' and substr(rowid,10,6)='AAHvSM';
insert into table SRV_INTERFACE_LOG_RSS_TMP0428 as select * from SRV_INTERFACE_LOG_RSS where rowid in (
select rowid from SRV_INTERFACE_LOG_RSS where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='ACt' and substr(rowid,10,6)='AAHvSM';
)
----------------------------------------
BILLING_LOG
object_id 11808359 AAtC5n
file_id 45 AAt
block_id 992115 AADyNz
select rowid from BILLING_LOG where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='AAt' and substr(rowid,10,6)='AADyNz';
create table BILLING_LOG_TMP0428 as select * from BILLING_LOG where rowid in (
select rowid from BILLING_LOG where substr(rowid,0,6)='&object_id' and substr(rowid,7,3)='AAt' and substr(rowid,10,6)='AADyNz';
)
----------------------------------------
ADDR_SEGM_GS
object_id 72817944 AEVx0Y
file_id 30 AAe
block_id 302212 AABJyE
select rowid from ADDR_SEGM_GS where substr(rowid,0,6)='AEVx0Y' and substr(rowid,7,3)='AAe' and substr(rowid,10,6)='AABJyE';
create table ADDR_SEGM_GS_TMP0428 as select * from ADDR_SEGM_GS where rowid in (
select rowid from ADDR_SEGM_GS where substr(rowid,0,6)='AEVx0Y' and substr(rowid,7,3)='AAe' and substr(rowid,10,6)='AABJyE';
)
- 将以上表中的数据跟生产库进行对比,对比后,通过dblink将表创建到生产库中,又前端业务人员将数据插入到生产库中,至此数据最大化恢复已经完成,但是备份人员在全备数据库的时候,报有坏块的出现,而且客户还比较着急,通过v$database_block_corruption查看有坏块,但是坏块不属于任何对象。以下为逻辑坏块的处理方法,通过扩展相关段的高水位线,插入数据进行覆盖就可以消除,之后全备成功。
-
相关逻辑坏块的修复
-
创建一张表,使得创建的表在这坏块所属的表空间中。
create table s (
n number,
c varchar2(4000)
) nologging tablespace <tablespace name having the corrupt block> pctfree 99;
- 查看是否属于所属的表空间
select segment_name,tablespace_name from user_segments
where segment_name='S' ;
3、查看创建的表已经在坏块所在的表空间中。
Select table_name,tablespace_name from user_tables where table_name='S' ;
4、
CREATE OR REPLACE TRIGGER corrupt_trigger
AFTER INSERT ON <username>.s
REFERENCING OLD AS p_old NEW AS new_p
FOR EACH ROW
DECLARE
corrupt EXCEPTION;
BEGIN
IF (dbms_rowid.rowid_block_number(:new_p.rowid)=&blocknumber)
and (dbms_rowid.rowid_relative_fno(:new_p.rowid)=&filenumber) THEN
RAISE corrupt;
END IF;
EXCEPTION
WHEN corrupt THEN
RAISE_APPLICATION_ERROR(-20000, 'Corrupt block has been formatted');
END;
5.查找坏坏所在的文件号以及块号
Select BYTES from dba_free_space where file_id=<file no> and <corrupt block no> between block_id and block_id + blocks -1; 65536
6. 手动的扩展坏块所在区的大小,直到它能够在坏块之上。使用循环语句插入,直到整个文件被填满。
alter table <username>.s
allocate extent (DATAFILE 'E:\xxxx\<datafilename>.ORA' SIZE 64K);
BEGIN
for i in 1..1000000 loop
EXECUTE IMMEDIATE 'alter table <username>.s allocate extent (DATAFILE '||'''E:\xxxx\<datafilename>.ORA''' ||'SIZE 64K) ';
end loop;
end ;
/
7.插入数据。覆盖逻辑坏块。
BEGIN
FOR i IN 1..1000000000 LOOP
INSERT /*+ APPEND */ INTO <username>.s select i, lpad('REFORMAT',3092, 'R') from dual;
commit ;
END LOOP;
END;
8.DROP TABLE <username>.s ;