MongoDB replication (2)

最新推荐文章于 2022-03-15 10:11:40 发布

Aegeaner

最新推荐文章于 2022-03-15 10:11:40 发布

阅读量574

点赞数

CC 4.0 BY-SA版权

本文链接：https://blog.youkuaiyun.com/Aegeaner/article/details/56277148

MongoDB replication (2)

OpLog启动流程：

mongoDbMain() => _initAndListen(listenPort) => listen(listenPort)

listen()函数：

logStartup();
startReplication();
server->run();

startReplication() => startMasterSlave()

       if ( replSettings.slave ) {
            verify( replSettings.slave == SimpleSlave );
            LOG(1) << "slave=true" << endl;
            boost::thread repl_thread(replSlaveThread);
        }

        if ( replSettings.master ) {
            LOG(1) << "master=true" << endl;
            replSettings.master = true;
            createOplog();
            boost::thread t(replMasterThread);
        }

创建OpLog:

/**
 * Ensures the replication oplog collection exists, creating it when missing.
 *
 * The global write lock is held for the whole call. The target namespace is
 * "local.oplog.$main", unless replSettings.replSet is non-empty, in which
 * case `rsoplog` is used instead.
 *
 * When the collection already exists:
 *   - if replSettings.oplogSize is non-zero and its size in whole MB differs
 *     from the existing collection's storage size in whole MB, the mismatch
 *     is logged and UserException(13257) is thrown;
 *   - outside replica-set mode, OpTime is seeded from the "ts" field of the
 *     document returned by a reverse-natural-order findOne;
 *   - the function returns without creating anything.
 *
 * When the collection does not exist, a capped collection without an _id
 * index is created. Its size is replSettings.oplogSize if non-zero, otherwise
 * a default: 50MB when pointers are narrower than 8 bytes, 192MB on 64-bit
 * Apple builds, and elsewhere the larger of 990MB and 5% of the free space
 * under dbpath, limited to 50GB.
 *
 * @throws UserException (code 13257) on a command-line/existing size mismatch.
 */
void createOplog() {
    Lock::GlobalWrite lk;

    const char * ns = "local.oplog.$main";

    const bool rs = !replSettings.replSet.empty();
    if ( rs )
        ns = rsoplog;

    Client::Context ctx(ns);
    Collection* collection = ctx.db()->getCollection( ns );

    if ( collection ) {
        // The oplog is already there: validate the requested size, then bail out.
        if ( replSettings.oplogSize != 0 ) {
            const int o = static_cast<int>( collection->storageSize() / ( 1024 * 1024 ) );
            const int n = static_cast<int>( replSettings.oplogSize / ( 1024 * 1024 ) );
            if ( n != o ) {
                stringstream ss;
                ss << "cmdline oplogsize (" << n << ") different than existing (" << o << ") see: http://dochub.mongodb.org/core/increase-oplog";
                log() << ss.str() << endl;
                throw UserException( 13257 , ss.str() );
            }
        }

        if ( rs )
            return;

        // Master/slave mode: continue the optime sequence from the existing entry.
        DBDirectClient c;
        BSONObj lastOp = c.findOne( ns, Query().sort(reverseNaturalObj) );
        if ( !lastOp.isEmpty() ) {
            OpTime::setLast( lastOp[ "ts" ].date() );
        }
        // Nothing to create; without this return an existing oplog would fall
        // through into the creation path below.
        return;
    }

    /* create an oplog collection, if it doesn't yet exist. */
    BSONObjBuilder b;
    double sz;
    if ( replSettings.oplogSize != 0 ) {
        sz = static_cast<double>( replSettings.oplogSize );
    }
    else {
        /* not specified. pick a default size */
        sz = 50.0 * 1024 * 1024;
        if ( sizeof(int *) >= 8 ) {
#if defined(__APPLE__)
            // typically these are desktops (dev machines), so keep it smallish
            sz = (256-64) * 1024 * 1024;
#else
            sz = 990.0 * 1024 * 1024;
            intmax_t free =
                File::freeSpace(storageGlobalParams.dbpath); //-1 if call not supported.
            double fivePct = free * 0.05;
            if ( fivePct > sz )
                sz = fivePct;
            // we use 5% of free space up to 50GB (1TB free)
            double upperBound = 50.0 * 1024 * 1024 * 1024;
            if ( fivePct > upperBound )
                sz = upperBound;
#endif
        }
    }

    log() << "******" << endl;
    log() << "creating replication oplog of size: " << static_cast<int>( sz / ( 1024 * 1024 ) ) << "MB..." << endl;

    b.append("size", sz);
    b.appendBool("capped", 1);
    b.appendBool("autoIndexId", false);

    string err;
    BSONObj o = b.done();
    // NOTE(review): the outcome of userCreateNS (and `err`) is not inspected
    // here -- confirm whether a creation failure should be surfaced.
    userCreateNS(ns, o, err, false);
    if ( !rs )
        logOp( "n", "", BSONObj() );

    /* sync here so we don't get any surprising lag later when we try to sync */
    MemoryMappedFile::flushAll(true);
    log() << "******" << endl;
}

插入oplog记录：

   /*@ @param opstr:
          c userCreateNS
          i insert
          n no-op / keepalive
          d delete / remove
          u update
    */
    void logOp(const char* opstr,
               const char* ns,
               const BSONObj& obj,
               BSONObj* patt,
               bool* b,
               bool fromMigrate,
               const BSONObj* fullObj) {

        try {
            if ( replSettings.master ) {
                _logOp(opstr, ns, 0, obj, patt, b, fromMigrate);
            }

            logOpForSharding(opstr, ns, obj, patt, fullObj, fromMigrate);
            logOpForDbHash(opstr, ns, obj, patt, fullObj, fromMigrate);
            getGlobalAuthorizationManager()->logOp(opstr, ns, obj, patt, b);

            if ( strstr( ns, ".system.js" ) ) {
                Scope::storedFuncMod(); // this is terrible
            }
        }

logOp() 中调用的 _logOp 在主从（master/slave）模式下对应 _logOpOld()，真正的写入逻辑在 _logOpOld() 中实现:

static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, bool fromMigrate ) {
        Lock::DBWrite lk("local");
        static BufBuilder bufbuilder(8*1024); // todo there is likely a mutex on this constructor

        if ( strncmp(ns, "local.", 6) == 0 ) {
            if ( strncmp(ns, "local.slaves", 12) == 0 ) {
                resetSlaveCache();
            }
            return;
        }

        mutex::scoped_lock lk2(OpTime::m);

        const OpTime ts = OpTime::now(lk2);
        Client::Context context("", 0);

        /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
           instead we do a single copy to the destination position in the memory mapped file.
        */

        bufbuilder.reset();
        BSONObjBuilder b(bufbuilder);
        b.appendTimestamp("ts", ts.asDate());
        b.append("op", opstr);
        b.append("ns", ns);
        if (fromMigrate)
            b.appendBool("fromMigrate", true);
        if ( bb )
            b.appendBool("b", *bb);
        if ( o2 )
            b.append("o2", *o2);
        BSONObj partial = b.done(); // partial is everything except the o:... part.

        if( logNS == 0 ) {
            logNS = "local.oplog.$main";
        }

BSON是MongoDB的数据格式。

    ts：8字节的时间戳，由4字节unix timestamp + 4字节自增计数表示。
        这个值很重要，在选举(如master宕机时)新primary时，会选择ts最大的那个secondary作为新primary。
    op：操作类型，通常为1个字符，例如i表示insert，d表示delete。
    ns：操作所在的namespace。
    o：操作所对应的document，即当前操作的内容（比如更新操作时要更新的字段和值）。
    o2：在执行更新操作时的where条件，仅限于update时才有该属性。

其中op可以是：

     "i"： insert
     "u"： update
     "d"： delete
     "c"： db cmd
     "db"：声明当前数据库 (其中ns被设置为 => 数据库名称 + '.')
     "n"： no op，即空操作，会定期执行以确保时效性

写入Oplog集合：

OplogDocWriter writer( partial, obj );
        checkOplogInsert( localOplogMainCollection->insertDocument( &writer, false ) );

replMasterThread() => logKeepalive() => _logOp("n", "", 0, BSONObj(), 0, 0, false);
即定期向oplog写入空操作（keepalive），使oplog中的最新时间戳保持更新，从而保证printReplicationStatus() 和printSlaveReplicationStatus() 输出的复制状态是及时的。

replMain()函数：

_replMain() 
=> ReplSource::Sync() 
=> ReplSource::applyOperation( op ) 
=> Sync::shouldRetry(const BSONObj& o)