MongoDB replication (2)
OpLog启动流程:
mongoDbMain() => _initAndListen(listenPort) => listen(listenPort)
listen()函数:
logStartup();
startReplication();
server->run();
startReplication() => startMasterSlave()
// Slave role: the only slave mode accepted here is SimpleSlave; the
// replication worker is launched on its own thread.
if ( replSettings.slave ) {
    verify( replSettings.slave == SimpleSlave );
    LOG(1) << "slave=true" << endl;
    boost::thread slaveWorker( replSlaveThread );
}

// Master role: the oplog must exist before the master thread is started,
// so createOplog() runs first.
if ( replSettings.master ) {
    LOG(1) << "master=true" << endl;
    replSettings.master = true;
    createOplog();
    boost::thread masterWorker( replMasterThread );
}
创建OpLog:
/**
 * Makes sure the replication oplog collection exists.
 *
 * The namespace is "local.oplog.$main", or rsoplog when replSettings.replSet
 * is non-empty.
 *
 * If the collection already exists:
 *   - when an oplog size was configured, it must match the existing storage
 *     size (compared in whole megabytes); otherwise UserException 13257 is
 *     thrown;
 *   - outside replica-set mode, OpTime's "last" value is seeded from the "ts"
 *     field of the newest oplog entry (if there is one);
 *   - nothing is created and the function returns.
 *
 * If the collection does not exist, a capped collection without an _id index
 * is created. Its size is replSettings.oplogSize when set, otherwise a
 * default:
 *   - 50MB when pointers are narrower than 8 bytes;
 *   - 192MB on 64-bit Apple builds;
 *   - on other 64-bit builds, the larger of 990MB and 5% of the free space
 *     under dbpath, capped at 50GB.
 *
 * @throws UserException (code 13257) on an oplog size mismatch.
 */
void createOplog() {
    Lock::GlobalWrite lk;

    const char * ns = "local.oplog.$main";

    bool rs = !replSettings.replSet.empty();
    if( rs )
        ns = rsoplog;

    Client::Context ctx(ns);
    Collection* collection = ctx.db()->getCollection( ns );

    if ( collection ) {
        // The oplog already exists: validate it, then leave without creating anything.
        if (replSettings.oplogSize != 0) {
            int o = (int)(collection->storageSize() / ( 1024 * 1024 ) );
            int n = (int)(replSettings.oplogSize / (1024 * 1024));
            if ( n != o ) {
                stringstream ss;
                ss << "cmdline oplogsize (" << n << ") different than existing (" << o << ") see: http://dochub.mongodb.org/core/increase-oplog";
                log() << ss.str() << endl;
                throw UserException( 13257 , ss.str() );
            }
        }

        if( rs ) return;

        // Seed OpTime from the newest entry (reverse natural order => last inserted).
        DBDirectClient c;
        BSONObj lastOp = c.findOne( ns, Query().sort(reverseNaturalObj) );
        if ( !lastOp.isEmpty() ) {
            OpTime::setLast( lastOp[ "ts" ].date() );
        }
        return;
    }

    /* create an oplog collection, if it doesn't yet exist. */
    BSONObjBuilder b;
    double sz;
    if (replSettings.oplogSize != 0)
        sz = (double)replSettings.oplogSize;
    else {
        /* not specified. pick a default size */
        sz = 50.0 * 1024 * 1024;
        if ( sizeof(int *) >= 8 ) {
#if defined(__APPLE__)
            // typically these are desktops (dev machines), so keep it smallish
            sz = (256-64) * 1024 * 1024;
#else
            sz = 990.0 * 1024 * 1024;
            intmax_t freeBytes =
                File::freeSpace(storageGlobalParams.dbpath); //-1 if call not supported.
            // A negative freeBytes yields a negative fivePct, so the 990MB floor is kept.
            double fivePct = freeBytes * 0.05;
            if ( fivePct > sz )
                sz = fivePct;
            // we use 5% of free space up to 50GB (1TB free)
            double upperBound = 50.0 * 1024 * 1024 * 1024;
            if (fivePct > upperBound)
                sz = upperBound;
#endif
        }
    }

    log() << "******" << endl;
    log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB..." << endl;

    b.append("size", sz);
    b.appendBool("capped", 1);
    b.appendBool("autoIndexId", false);

    string err;
    BSONObj o = b.done();
    userCreateNS(ns, o, err, false);
    if( !rs )
        logOp( "n", "", BSONObj() );

    /* sync here so we don't get any surprising lag later when we try to sync */
    MemoryMappedFile::flushAll(true);
    log() << "******" << endl;
}
插入oplog记录:
/**
 * Records an operation with every interested subsystem.
 *
 * @param opstr operation type code:
 *   c userCreateNS
 *   i insert
 *   n no-op / keepalive
 *   d delete / remove
 *   u update
 * @param ns          namespace string; also searched for ".system.js" below.
 * @param obj         forwarded unchanged to every logger called here.
 * @param patt        forwarded unchanged to every logger called here.
 * @param b           forwarded to _logOp and to the authorization manager.
 * @param fromMigrate forwarded to _logOp, logOpForSharding and logOpForDbHash.
 * @param fullObj     forwarded to logOpForSharding and logOpForDbHash only.
 */
void logOp(const char* opstr,
const char* ns,
const BSONObj& obj,
BSONObj* patt,
bool* b,
bool fromMigrate,
const BSONObj* fullObj) {
try {
// The oplog itself is written only when this node is configured as master;
// 0 is passed as _logOp's third argument.
if ( replSettings.master ) {
_logOp(opstr, ns, 0, obj, patt, b, fromMigrate);
}
// These three are called regardless of the master setting.
logOpForSharding(opstr, ns, obj, patt, fullObj, fromMigrate);
logOpForDbHash(opstr, ns, obj, patt, fullObj, fromMigrate);
getGlobalAuthorizationManager()->logOp(opstr, ns, obj, patt, b);
// Any namespace containing ".system.js" triggers Scope::storedFuncMod().
// NOTE(review): presumably this signals that stored JS functions changed - confirm.
if ( strstr( ns, ".system.js" ) ) {
Scope::storedFuncMod(); // this is terrible
}
}
// NOTE(review): the excerpt stops here - the handler(s) for the try block and
// the function's closing brace are not shown.
logOp() 中调用的 _logOp 是一个函数指针,在 master/slave 模式下指向 _logOpOld(),所以真正的实现在 _logOpOld():
// Builds the header part of an oplog entry ("ts", "op", "ns" and the optional
// "fromMigrate", "b" and "o2" fields) for an operation on namespace `ns`.
// Operations on "local." namespaces are not logged.
// When logNS is 0 the target oplog namespace defaults to "local.oplog.$main".
// NOTE(review): this excerpt ends before the function's closing brace.
static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, bool fromMigrate ) {
Lock::DBWrite lk("local");
// Static, so the same buffer object is reused by every call (reset below).
static BufBuilder bufbuilder(8*1024); // todo there is likely a mutex on this constructor
// Skip anything in the "local" database; a namespace starting with
// "local.slaves" additionally resets the slave cache before returning.
if ( strncmp(ns, "local.", 6) == 0 ) {
if ( strncmp(ns, "local.slaves", 12) == 0 ) {
resetSlaveCache();
}
return;
}
// The timestamp is obtained while holding OpTime::m.
mutex::scoped_lock lk2(OpTime::m);
const OpTime ts = OpTime::now(lk2);
Client::Context context("", 0);
/* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
instead we do a single copy to the destination position in the memory mapped file.
*/
bufbuilder.reset();
BSONObjBuilder b(bufbuilder);
b.appendTimestamp("ts", ts.asDate());
b.append("op", opstr);
b.append("ns", ns);
// Optional fields are appended only when set / non-null.
if (fromMigrate)
b.appendBool("fromMigrate", true);
if ( bb )
b.appendBool("b", *bb);
if ( o2 )
b.append("o2", *o2);
BSONObj partial = b.done(); // partial is everything except the o:... part.
if( logNS == 0 ) {
logNS = "local.oplog.$main";
}
BSON是MongoDB的数据格式。
ts:8字节的时间戳,由4字节unix timestamp + 4字节自增计数表示。
这个值很重要:在选举新primary时(例如原master宕机),会选择ts最大的那个secondary作为新primary。
op:1字节的操作类型,例如i表示insert,d表示delete。
ns:操作所在的namespace。
o:操作所对应的document,即当前操作的内容(比如更新操作时要更新的字段和值)。
o2: 在执行更新操作时的where条件,仅限于update时才有该属性
其中op可以是:
"i": insert
"u": update
"d": delete
"c": db cmd
"db": 声明当前数据库(此时ns被设置为:数据库名称 + '.')
"n": no op,即空操作,其会定期执行以确保时效性
写入Oplog集合:
// Continuation of _logOpOld(): the header fields (partial) and the operation
// document (obj) are handed to a single writer, which is inserted into the
// oplog collection; the insert result is passed to checkOplogInsert().
// NOTE(review): presumably the writer emits obj as the "o" field directly into
// the destination record to avoid an extra copy - confirm against OplogDocWriter.
OplogDocWriter writer( partial, obj );
checkOplogInsert( localOplogMainCollection->insertDocument( &writer, false ) );
replMasterThread() => logKeepalive() => _logOp("n", "", 0, BSONObj(), 0, 0, false);
定期写入空操作(keepalive),使 printReplicationStatus() 和 printSlaveReplicationStatus() 在系统空闲时也能显示最新的复制状态。
replMain()函数:
_replMain()
=> ReplSource::Sync()
=> ReplSource::applyOperation( op )
=> Sync::shouldRetry(const BSONObj& o)