GetMatches

最新推荐文章于 2021-11-11 08:08:00 发布

原创最新推荐文章于 2021-11-11 08:08:00 发布 · 572 阅读

CC 4.0 BY-SA版权


/// Produce the next batch of ranked matches in m_dMatches.
///
/// Walks the hit list returned by the root node document by document. Every
/// hit of the current document is fed to m_tState.Update(); once the hits of
/// that document are exhausted, the document's match is swapped from
/// m_dMyMatches into m_dMatches and its m_iWeight is set from
/// m_tState.Finalize().
///
/// The function is resumable: on exit the current doc-list and hit-list
/// cursors are saved to m_pDoclist / m_pHitlist, and the next call picks up
/// from them.
///
/// @return number of entries written to m_dMatches; 0 when there is no root
///         node or nothing (more) to match.
template < typename STATE >
int ExtRanker_T<STATE>::GetMatches ()
{
	// no query tree, nothing to match
	if ( !m_pRoot )
		return 0;

	int iMatches = 0;
	// resume from the cursors saved by the previous call (see end of function)
	const ExtHit_t * pHlist = m_pHitlist;
	const ExtDoc_t * pDocs = m_pDoclist;

	// warmup if necessary
	// (no saved hit list: fetch a docs block if we lack one, then its first hits chunk;
	// either one being unavailable means there is nothing to return)
	if ( !pHlist )
	{
		if ( !pDocs ) pDocs = GetFilteredDocs ();
		if ( !pDocs ) return iMatches;

		pHlist = m_pRoot->GetHitsChunk ( pDocs, m_uMaxID );
		if ( !pHlist ) return iMatches;
	}

	// main matching loop
	// pDoc walks inside the current docs block; pDocs keeps pointing at the block start.
	// uCurDocid==0 means "no document is being accumulated yet".
	// The loop stops once MAX_DOCS matches were emitted or the docs ran out.
	const ExtDoc_t * pDoc = pDocs;
	for ( SphDocID_t uCurDocid=0; iMatches<ExtNode_i::MAX_DOCS; )
	{
		// keep ranking
		while ( pHlist->m_uDocid==uCurDocid )
			// feed every hit of the current document into the ranker state
		 	m_tState.Update ( pHlist++ );

		// if hits block is over, get next block, but do *not* flush current doc
		// (the next chunk may still carry more hits for uCurDocid, so go back to ranking)
		if ( pHlist->m_uDocid==DOCID_MAX )
		{
			assert ( pDocs );
			pHlist = m_pRoot->GetHitsChunk ( pDocs, m_uMaxID );
			if ( pHlist )
				continue;
		}

		// otherwise (new match or no next hits block), flush current doc
		if ( uCurDocid )
		{
			assert ( uCurDocid==pDoc->m_uDocid );
			// pDoc-m_dMyDocs is the index of this doc within m_dMyDocs;
			// the match at the same index in m_dMyMatches is moved to the output
			Swap ( m_dMatches[iMatches], m_dMyMatches[pDoc-m_dMyDocs] );
			m_dMatches[iMatches].m_iWeight = m_tState.Finalize ( m_dMatches[iMatches] );
			iMatches++;
		}

		// boundary checks
		if ( !pHlist )
		{
			// there are no more hits for current docs block; do we have a next one?
			assert ( pDocs );
			pDoc = pDocs = GetFilteredDocs ();

			// we don't, so bail out
			if ( !pDocs )
				break;

			// we do, get some hits
			pHlist = m_pRoot->GetHitsChunk ( pDocs, m_uMaxID );
			assert ( pHlist ); // fresh docs block, must have hits
		}

		// skip until next good doc/hit pair
		// (advance pDoc to the document that the next hit belongs to)
		assert ( pDoc->m_uDocid<=pHlist->m_uDocid );
		while ( pDoc->m_uDocid<pHlist->m_uDocid ) pDoc++;
		assert ( pDoc->m_uDocid==pHlist->m_uDocid );

		uCurDocid = pHlist->m_uDocid;
	}

	// save the cursors so that the next call can resume from here
	m_pDoclist = pDocs;
	m_pHitlist = pHlist;
	return iMatches;
}

通过下边这段代码可以发现;

		while ( pHlist->m_uDocid==uCurDocid )
			// feed every hit of the current document into the ranker state
		 	m_tState.Update ( pHlist++ );

只要pHlist指向的ExtHit_t结构的m_uDocid等于当前文档id（uCurDocid），就不断调用Update函数并把指针后移，把该文档的每一个命中（hit）累积进m_tState；该文档的命中处理完之后，再由Finalize根据m_tState算出这个文档的最终权重（m_iWeight）。

/// hit in the stream
/// (one element of the hit lists walked by the ranker; the ranker treats an
/// element whose m_uDocid equals DOCID_MAX as the end of the current chunk)
struct ExtHit_t
{
	SphDocID_t	m_uDocid;		///< id of the document this hit belongs to
	Hitpos_t	m_uHitpos;		///< packed hit position; NOTE(review): presumably field index, field-end flag and in-field position packed together - confirm against the Hitpos_t helpers
	WORD		m_uQuerypos;	///< NOTE(review): presumably the keyword position within the query - confirm
	WORD		m_uNodepos;		///< NOTE(review): semantics not visible here - confirm
	WORD		m_uSpanlen;		///< NOTE(review): semantics not visible here - confirm
	WORD		m_uMatchlen;	///< NOTE(review): semantics not visible here - confirm
	DWORD		m_uWeight;		///< NOTE(review): presumably a per-hit weight - confirm
};

ExtHit_t的结构如上：m_uDocid存储文档id；而m_uHitpos是一个打包字段，同时编码了命中所在的字段编号、字段结束标志以及词在字段内的位置，排序时还会用它来计算最长公共子串（LCS）。

字段值最大是多少呢？

if ( m_uCurLCS>m_uLCS[uField] )
			m_uLCS[uField] = m_uCurLCS;

BYTE m_uLCS[SPH_MAX_FIELDS];

#define SPH_MAX_FIELDS			256

可以发现最多支持256个字段，因此字段编号的取值范围是0到255，刚好可以用8位表示；字段编号就存储在m_uHitpos的最高8位中。

	/// Return the packed hit position with the field-end flag stripped.
	/// NOTE(review): FIELDEND_MASK is declared elsewhere; presumably it is 1<<23 - confirm.
	static inline DWORD GetLCS ( Hitpos_t uHitpos )
	{
		// invert the field-end mask and AND it in: clears the flag, keeps all other bits
		return ( ~FIELDEND_MASK ) & uHitpos;
	}

正数的补码是其本身，负数的补码是除符号位取反加1。

此处一定要区分反码与取反的区别：

1的反码还是1

1按位取反，11111....1110，而计算机在存储的时候是补码存储的，-2的补码刚好是这个，所以1按位取反是-2.

同样，00000000 10000000 00000000 00000000
取反，11111111 01111111 11111111 11111111

减1， 11111111 01111111 11111111 11111110

除符号位取反

10000000 10000000 00000000 00000001

所以它按位取反后的结果（按有符号32位整数解读）是-8388609。注意这是“按位取反”的结果，而不是它的“反码”；若按无符号数解读，则是0xFF7FFFFF。

这是数据举例

indextool -c ../etc/csft_daquan_suggest_all.conf --dumphitlist csft_daquan_suggest_web dai

通过indextool工具查看索引的情况，可以看到索引的内容，此语句查找命中dai的doc和hit

doc=500163049, hit=0x00800001
doc=500163049, hit=0x01800001

mysql> select * from IBO_suggest_info_web where id=500163049 \G;
*************************** 1. row ***************************
id: 500163049
type: 5
title: Daisy
weight: 2012583
pinyin: Daisy
is_web_removed: 1
is_client_removed: 1

在上边介绍的图中可以发现0x00800001的前8位，即00表示命中的字段，在此例中有00和01，

在上边数据库数据中也可以发现，title和pinyin中都有dai的数据。

0x00800001中的80（即第23位，对应FIELDEND_MASK）是字段结束标志，表示该命中是所在字段的最后一个词。

后16位为，0001，表示title分词的第一个命中，

下边再举一例

doc=117884, hit=0x00800002

mysql> select * from IBO_suggest_info_web where id=117884 \G;
*************************** 1. row ***************************
id: 117884
type: 0
title: 茫然一代
weight: 1976000
pinyin: mangranyidai
is_web_removed: 1
is_client_removed: 1

可以用mmseg查看分词情况：

茫然/x 一代/x

“一代”是分词结果中的第二个词，并且后面没有其他内容（即它是字段的最后一个词），所以字段结束标志被置位（80），低位的位置值为02。