length={8}, pfRtv={ 1.000000} length={27}, pfRtv={ 2.000000} length={64}, pfRtv={ 3.000000}
doc0 start
10 2.000000 11 3.000000 12 6.000000 13 11.000000 14 18.000000 15 7.000000 16 18.000000 17 11.000000
doc1 start
11 3.000000 12 4.000000 13 7.000000 14 12.000000 15 19.000000 16 8.000000 17 19.000000 18 12.000000 19 7.000000 20 4.000000 21 3.000000 22 4.000000 23 7.000000 24 12.000000 25 19.000000 26 8.000000 27 19.000000 28 12.000000 29 7.000000 30
4.000000 31 3.000000 32 4.000000 33 7.000000 34 12.000000 35 19.000000 36 8.000000 37 19.000000
doc2 start
12 4.000000 13 5.000000 14 8.000000 15 13.000000 16 0.000000 17 9.000000 18 0.000000 19 13.000000 20 8.000000 21 5.000000 22 4.000000 23 5.000000 24 8.000000 25 13.000000 26 0.000000 27 9.000000 28 0.000000 29 13.000000 30 8.000000 31 5.000000
32 4.000000 33 5.000000 34 8.000000 35 13.000000 36 0.000000 37 9.000000 38 0.000000 39 13.000000 40 8.000000 41 5.000000 42 4.000000 43 5.000000 44 8.000000 45 13.000000 46 0.000000 47 9.000000 48 0.000000 49 13.000000 50 8.000000 51 5.000000
52 4.000000 53 5.000000 54 8.000000 55 13.000000 56 0.000000 57 9.000000 58 0.000000 59 13.000000 60 8.000000 61 5.000000 62 4.000000 63 5.000000 64 8.000000 65 13.000000 66 0.000000 67 9.000000 68 0.000000 69 13.000000 70 8.000000 71 5.000000
72 4.000000 73 5.000000 74 8.000000 75 13.000000
result
12 0.000397
13 0.000610
14 0.001007
15 0.001282
16 0.000519
17 0.001160
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
/*
 * One entry of a document list: a document id paired with a float weight.
 * The search routines in this file match entries on iDocID and combine
 * their tf values.
 */
typedef struct _SSDOC_ITEM_TMP
{
/* Document id; the key the list-intersection routines compare on. */
ushort iDocID;
/* Weight carried by the entry (presumably a term frequency, going by the name -- TODO confirm). */
float tf;
} SSDOC_ITEM_TMP;
/* Type used for the per-list weight multiplier passed to the search routines. */
#define RTV_TYPE float
/* Divisor applied to tf values before they are multiplied by the list weight. */
#define MAX_USHORT 65535
/* Element type of the search-argument array handed to GetSameDocWrap. */
#define PAGE_SEARCH_ARG int
/*
 * Number of document ids covered by one bucket (65536).
 * The expansion is fully parenthesised so that it behaves as a single value
 * next to operators such as '/' and '%'.
 */
#define BUCKET_VECTOR_BLOCK (64*(1024))
/*
 * Per-list metadata that accompanies each document array handed to
 * GetSameDocWrap.
 */
typedef struct _TERM_DIC_HASH_VALUE_INTER
{
/* Points at the weight multiplier for the list; it is dereferenced when tf values are scaled. */
float * pfRtv;
/* Number of entries in the matching document array. */
int length;
}TERM_DIC_HASH_VALUE_INTER;
/*
 * Binary search for docid in data[left..right] (both bounds inclusive).
 *
 * The narrowing step moves right when docid is greater than the middle
 * entry and left otherwise, so it only finds the entry when data is ordered
 * by ascending iDocID.
 *
 * Returns the index of the matching entry, or -1 when there is none.  If
 * both of the last two candidates match, the higher index is returned.
 * A NULL array, a negative lower bound or an empty range (right < left)
 * yields -1 instead of touching memory outside the range.
 */
int find(int left,int right,ushort docid ,SSDOC_ITEM_TMP *data )
{
    if (data == NULL || left < 0 || right < left)
    {
        return -1;
    }

    /* Shrink the window until at most two candidates remain. */
    while (right - left > 1)
    {
        /* Written this way so the sum of the bounds cannot overflow. */
        const int mid = left + (right - left) / 2;
        if (docid > data[mid].iDocID)
        {
            left = mid;
        }
        else
        {
            right = mid;
        }
    }

    if (docid == data[right].iDocID)
    {
        return right;
    }
    if (docid == data[left].iDocID)
    {
        return left;
    }
    return -1;
}
void GetSameDocBSearch( int &nCommon, SSDOC_ITEM_TMP *common_doc, SSDOC_ITEM_TMP *left,SSDOC_ITEM_TMP *right,
int left_length ,int right_length,RTV_TYPE rtv)
{
int i,j;
for(i=0;i<left_length;i++,left++)
{
short pos=0;
if ( -1== (pos=find ( 0 ,right_length , left->iDocID ,right )))
{
continue;
}
else
{
common_doc[ nCommon].iDocID=left->iDocID ;
common_doc[ nCommon ].tf = left->tf + (((float)(right+pos)->tf) / MAX_USHORT) * rtv;
nCommon++;
}
}
}
void GetSameDocMergeSearch( int &nCommon, SSDOC_ITEM_TMP *common_doc, SSDOC_ITEM_TMP *left,SSDOC_ITEM_TMP *right,
int left_length ,int right_length,RTV_TYPE rtv)
{
int left_tmp=0,right_tmp=0;
while(1)
{
if(left_tmp>=left_length)
{
return ;
}
if(right_tmp>=right_length)
{
return ;
}
if (left->iDocID ==right->iDocID)
{
common_doc[ nCommon].iDocID=left->iDocID ;
common_doc[ nCommon ].tf = left->tf + (((float)right->tf) / MAX_USHORT) * rtv;
nCommon++;
left_tmp++;
left++;
right_tmp++;
right++;
}
else if (left->iDocID < right->iDocID)
{
left_tmp++;
left++;
}
else
{
right_tmp++;
right++;
}
}
}
void GetSameDocWrap(ushort sBucketPos, int &nSameCount,SSDOC_ITEM_TMP *aSameDoc, int nDocArray, SSDOC_ITEM_TMP *pDocArray[],
PAGE_SEARCH_ARG *pArg, const TERM_DIC_HASH_VALUE_INTER * const pDocList, off_t *loffset)
{
off_t lloffset[nDocArray];
SSDOC_ITEM_TMP *pTmpDocArray[nDocArray];
memset(lloffset, 0, sizeof(off_t) * nDocArray);
memcpy(pTmpDocArray, pDocArray, nDocArray * sizeof(SSDOC_ITEM_TMP *));
SSDOC_ITEM_TMP result_common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
ushort left=pDocList[0].length;
memcpy ( result_common_doc , pDocArray[0], left * sizeof ( SSDOC_ITEM_TMP) );
for(int k=0;k<left ;k++)
{
result_common_doc[k].tf = (((float)(pTmpDocArray[0]+k)->tf) / MAX_USHORT) * (*(pDocList[0].pfRtv));
}
for( int j=1;j<nDocArray;j++)
{
ushort right=pDocList[j].length;
// if(left + right > left * log ( right )/log (2) )
if(0)
{
int nCommon=0;
SSDOC_ITEM_TMP common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
GetSameDocMergeSearch(nCommon , common_doc ,
result_common_doc,pDocArray[j], left ,right ,(*(pDocList[j].pfRtv)) );
if(0==nCommon)
{
return ;
}
else
{
left=nCommon;
memcpy ( result_common_doc , common_doc , nCommon * sizeof ( SSDOC_ITEM_TMP));
}
}
else
{
int nCommon=0;
SSDOC_ITEM_TMP common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
GetSameDocBSearch(nCommon , common_doc ,result_common_doc ,pDocArray[j], left,right ,(*(pDocList[j].pfRtv)));
if(0==nCommon)
{
return ;
}
else
{
left=nCommon;
memcpy ( result_common_doc , common_doc , nCommon * sizeof ( SSDOC_ITEM_TMP));
}
}
}
nSameCount=left;
for(int kk=0;kk<left;kk++)
{
uint iDocID = aSameDoc[kk].iDocID = result_common_doc[kk].iDocID + BUCKET_VECTOR_BLOCK * sBucketPos;
aSameDoc[kk].tf = result_common_doc[kk].tf;
}
/*
{
ushort doc_count=pDocList[j]->iTotalDocCnt ;
if (doc_count > 0.1*MAX_DOC_COUNT)
{
uint iDocID = aSameDoc[nSameCount].iDocID = pTmpDocArray[j]->iDocID + BUCKET_VECTOR_BLOCK * sBucketPos;
ushort buc_pos =iDocID/256;
ushort buc_mod =iDocID%256;
lloffset[j] += pTmpDocArray[j]->nOccurs;
}
else
{
for(int ii=0;ii<pos ;i++)
{
lloffset[ii]+=(pTmpDocArray[j]+ii)->nOccurs;
}
}
}
*/
/*
while (pTmpDocArray[j]->iDocID < pTmpDocArray[j - 1]->iDocID)
{
lloffset[j] += pTmpDocArray[j]->nOccurs;
pTmpDocArray[j]++;
}
if (!pTmpDocArray[j]->nOccurs)
return;
*/
}
/*
 * Demo driver: builds LENGTH synthetic document lists, prints them,
 * intersects them with GetSameDocWrap and prints the result.
 *
 * List i has (i+2)^3 entries with ids i+10, i+11, ... and weight multiplier
 * i+1.  Returns EXIT_SUCCESS, or EXIT_FAILURE when memory runs out.
 */
int main()
{
#define LENGTH 3
    /* Starts at 0 so the result loop is well defined whatever the callee does. */
    int nSameCount = 0;
    int failed = 0;
    int nDocArray = LENGTH;
    SSDOC_ITEM_TMP *pDocArray[LENGTH] = {NULL};
    PAGE_SEARCH_ARG pArg[LENGTH] = {0};
    TERM_DIC_HASH_VALUE_INTER pDocList[LENGTH] = {0};

    /* The result buffer lives on the heap rather than as a 512 KB stack array. */
    SSDOC_ITEM_TMP *aSameDoc = (SSDOC_ITEM_TMP *)calloc(MAX_USHORT, sizeof(SSDOC_ITEM_TMP));
    if (aSameDoc == NULL)
    {
        failed = 1;
    }

    for (int i = 0; !failed && i < nDocArray; i++)
    {
        pDocList[i].pfRtv = (float *)calloc(1, sizeof(float));
        if (pDocList[i].pfRtv == NULL)
        {
            failed = 1;
            break;
        }
        *(pDocList[i].pfRtv) = i + 1;
        pDocList[i].length = (i + 2) * (i + 2) * (i + 2);
        printf("length={%d}, pfRtv={ %f} ",pDocList[i].length, *(pDocList[i].pfRtv) );
    }
    if (!failed)
    {
        printf("\n");
    }

    for (int i = 0; !failed && i < nDocArray; i++)
    {
        printf("doc%d start \n",i);
        pDocArray[i] = (SSDOC_ITEM_TMP *)calloc(pDocList[i].length, sizeof(SSDOC_ITEM_TMP));
        if (pDocArray[i] == NULL)
        {
            failed = 1;
            break;
        }
        for (int j = 0; j < pDocList[i].length; j++)
        {
            pDocArray[i][j].iDocID = i + j + 10;
            pDocArray[i][j].tf = (j * j + 2 + i) % 20;
            printf("%d %f ", pDocArray[i][j].iDocID,pDocArray[i][j].tf);
        }
        printf("\n");
    }

    if (!failed)
    {
        GetSameDocWrap(0, nSameCount, aSameDoc, nDocArray, pDocArray, pArg, pDocList, NULL);
        printf("result \n");
        for (int i = 0; i < nSameCount; i++)
        {
            printf("%d %f \n", aSameDoc[i].iDocID,aSameDoc[i].tf);
        }
    }
    else
    {
        fprintf(stderr, "out of memory\n");
    }

    /* Release everything that was allocated; free(NULL) is harmless. */
    for (int i = 0; i < nDocArray; i++)
    {
        free(pDocArray[i]);
        free(pDocList[i].pfRtv);
    }
    free(aSameDoc);

    return failed ? EXIT_FAILURE : EXIT_SUCCESS;
}