答某道题,题见http://topic.youkuaiyun.com/u/20080424/11/886e2197-4bc7-42b1-a273-0c2915729f0b.html
解,代码如下:
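/*
 * Task: given a string, look at the substrings that repeat the most times
 * (there may be many of them) and find one of the longest among them.
 * Let sl be the minimum length a substring must have; usually sl = 2, i.e.
 * at least two characters are needed to count as a substring.
 *
 * Analysis: the most-repeated substrings necessarily include one of length sl
 * (every occurrence of a longer substring also contains an occurrence of its
 * length-sl prefix), so:
 * 1. With substring length pow1 = sl, quickly compute the maximum repeat
 *    count nr_max[pow1].
 * 2. With substring length pow1 + pow2 (pow2 is initially 1):
 *    - if nr_max[pow1 + pow2] == nr_max[pow1], set pow1 = pow1 + pow2 and
 *      pow2 = pow2 * 2 (exponential growth), then repeat step 2;
 *    - if nr_max[pow1 + pow2] < nr_max[pow1]: when pow2 > 1, set pow2 = 1 and
 *      repeat step 2; when pow2 == 1, the longest substring has length pow1
 *      and the search ends.
 */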
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
#include <string>

#include <windows.h>
// windows.h provides GetTickCount(), used to measure the elapsed time interval
using
namespace
std;
// Separator line printed around each report produced by solve().
#define slice_line "+++++++++++++++++++++++++++++"
// Primes commonly used as hash-table moduli; each entry is roughly twice the
// previous one. The trailing comment on every line is the entry's index.
static const int prime_array[] =
{
    17,          /* 0 */
    37,          /* 1 */
    79,          /* 2 */
    163,         /* 3 */
    331,         /* 4 */
    673,         /* 5 */
    1361,        /* 6 */
    2729,        /* 7 */
    5471,        /* 8 */
    10949,       /* 9 */
    21911,       /* 10 */
    43853,       /* 11 */
    87719,       /* 12 */
    175447,      /* 13 */
    350899,      /* 14 */
    701819,      /* 15 */
    1403641,     /* 16 */
    2807303,     /* 17 */
    5614657,     /* 18 */
    11229331,    /* 19 */
    22458671,    /* 20 */
    44917381,    /* 21 */
    89834777,    /* 22 */
    179669557,   /* 23 */
    359339171,   /* 24 */
    718678369,   /* 25 */
    1437356741,  /* 26 */
    2147483647   /* 27 (largest signed int prime) */
};
// Hash table used to count occurrences of substring hashes
// Chained hash table that counts how many times a (bucket index, secondary
// hash) pair has been recorded.
//
// Entries are identified ONLY by the bucket they are put in and by `ahash`;
// no key material is stored, so two different keys that agree on both values
// share one counter.
//
// Chain nodes are carved out of fixed-size pools, so recording an entry does
// not normally allocate; clear() recycles every pool instead of freeing it.
class Hashtable
{
    // ############################################## type definitions
private:
    // One chain node: the occurrence counter for a single secondary hash.
    struct Nr_substr
    {
        unsigned int nr;     // occurrences recorded so far
        unsigned int ahash;  // secondary hash identifying the entry in its bucket
        Nr_substr*   next;   // next node of the same bucket chain

        Nr_substr() : nr(0), ahash(0), next(NULL) {}
    };
    typedef Nr_substr* PNr_substr;

    // A fixed-size block of nodes that are handed out front to back.
    struct Pool
    {
        PNr_substr buf;      // node storage, owned by the pool
        int        bufsize;  // number of nodes in buf
        int        useness;  // number of nodes already handed out
        bool       full;     // true once every node has been handed out
        Pool*      next;     // next pool in the owner's list

        explicit Pool(int _bufsize)
            : buf(NULL), bufsize(_bufsize), useness(0), full(false), next(NULL)
        {
            buf = new Nr_substr[bufsize];
        }

        // buf comes from new[], so it must be released with delete[].
        ~Pool() { delete[] buf; }

        // Returns every node to its pristine state and makes the whole pool
        // available again. Nodes past `useness` were never handed out and are
        // still in their default-constructed state.
        void clear()
        {
            for (int i = 0; i < useness; ++i)
                buf[i] = Nr_substr();
            useness = 0;
            full = false;
        }

    private:
        // The pool owns raw memory: copying is not supported.
        Pool(const Pool&);
        Pool& operator=(const Pool&);
    };
    typedef Pool* PoolPtr;

    // ############################################### interface
public:
    // Creates a table with `_hashsize` buckets (at least one). Each pool holds
    // half as many nodes as there are buckets, but never fewer than one.
    Hashtable(int _hashsize)
        : pool_list(NULL), poolsize(0), hashtable(NULL),
          hashsize(_hashsize > 0 ? _hashsize : 1)
    {
        poolsize = (hashsize / 2 > 0) ? hashsize / 2 : 1;
        hashtable = new PNr_substr[hashsize];
        for (int i = 0; i < hashsize; ++i)
            hashtable[i] = NULL;
    }

    // Releases every pool in the list as well as the bucket array.
    ~Hashtable()
    {
        while (pool_list != NULL)
        {
            PoolPtr following = pool_list->next;
            delete pool_list;
            pool_list = following;
        }
        delete[] hashtable;
    }

    // Records one more occurrence of the entry `ahash` in bucket `index` and
    // returns its updated count (1 for a first occurrence).
    // Precondition: 0 <= index < number of buckets given to the constructor.
    int nr_str(int index, unsigned ahash)
    {
        // Walk the chain through the link that points at the current node so
        // that a missing entry can be appended at the tail without a special
        // case for an empty bucket.
        PNr_substr* link = &hashtable[index];
        while (*link != NULL && (*link)->ahash != ahash)
            link = &(*link)->next;

        if (*link == NULL)
        {
            *link = get_Nr_substr();
            (*link)->ahash = ahash;
        }
        return static_cast<int>(++(*link)->nr);
    }

    // Forgets every recorded entry while keeping all allocated pools for reuse.
    void clear()
    {
        for (PoolPtr p = pool_list; p != NULL; p = p->next)
            p->clear();
        for (int i = 0; i < hashsize; ++i)
            hashtable[i] = NULL;
    }

    // Hands out a pristine node, allocating a new pool (pushed to the front of
    // the list) when every existing pool is exhausted.
    PNr_substr get_Nr_substr()
    {
        PoolPtr p = pool_list;
        while (p != NULL && p->full)
            p = p->next;

        if (p == NULL)
        {
            p = new Pool(poolsize);
            p->next = pool_list;
            pool_list = p;
        }

        PNr_substr node = &p->buf[p->useness++];
        // Update the flag on every path, including for a freshly created pool,
        // so that a one-node pool is never indexed past its end.
        p->full = (p->useness == p->bufsize);
        return node;
    }

    // ############################################# inner data
private:
    // The table owns raw memory: copying is not supported.
    Hashtable(const Hashtable&);
    Hashtable& operator=(const Hashtable&);

    PoolPtr     pool_list;  // singly linked list of node pools
    int         poolsize;   // number of nodes per pool
    PNr_substr* hashtable;  // bucket array of chain heads
    int         hashsize;   // number of buckets
};
void
solve(
char
*
str,
int
lmin)
...
{ int strl = strlen(str); cout << slice_line << " -- src string len: " << strl << endl; cout << " -- sub string len>= " << lmin << endl; // 根据串长计算合适的hash_size int hash_size =- 4 ; while (strl = strl >> 1 ) hash_size ++ ; if (hash_size < 0 ) hash_size = 0 ; hash_size = prime_array[hash_size]; cout << " -- hash_size: " << hash_size << endl; Hashtable hashtable(hash_size); // 用于保存最“活跃”子串的位置 char * sub_str = NULL; // hash缓存 unsigned int phash = 0 ; unsigned int ahash = 0 ; int time_start = GetTickCount(); int nr = 0 ; // 零时计数 int nr_max = 0 ; // 子串最大重复次数 int count; // 重复nr_max次的子串的个数,当重复nr_max次的子串count>=1,而重复nr_max+1次的子串count==0时获解! char * s = NULL; // 零时字符指针 // 扫描获的子串重复最大数目 s = str + lmin - 1 ; while ( * s) ... { phash = 0 ; ahash = 0 ; for ( int i = 0 ;i < lmin;i ++ ) ... { phash = phash * ( * (s - i)) + ( * (s - i)); ahash = (ahash << 3 ) + ( * (s - i)); } nr= hashtable.nr_str(phash % hash_size,ahash); if (nr > nr_max) ... { nr_max = nr; sub_str = s; } s++ ; } int pow1 = lmin; int pow2 = 1 ; // 2^0 // 采用指数增长方式快速定位最长符合条件的最长子串之一 while ( 1 ) ... { hashtable.clear(); count = 0 ; s = str + pow1 + pow2 - 1 ; while ( * s) ... { phash = 0 ; ahash = 0 ; for ( int i = 0 ;i < pow1 + pow2;i ++ ) ... { phash = phash * ( * (s - i)) + ( * (s - i)); ahash = (ahash << 3 ) + ( * (s - i)); } nr= hashtable.nr_str(phash % hash_size,ahash); if (nr == nr_max) ... { sub_str = s; pow1 = pow1 + pow2; pow2 = pow2 << 2 ; continue ; } s++ ; } if (pow2 > 1 ) ... { pow2 = 1 ; continue ; } else break ; // ok,we get it! } int time_end = GetTickCount(); cout << slice_line << " -- time elapse: " << time_end - time_start << " ms " << endl; cout << " -- sub string max nr : " << nr_max << endl; char * sub_str_m = new char [pow1 + 1 ]; strncpy(sub_str_m,sub_str - pow1 + 1 ,pow1); sub_str_m[pow1] = 0 ; cout << " -- sub string is : " << sub_str_m << " " << slice_line << " " ; }
// test string generator
// Test input generator: `size` repetitions of "a..zA..Z" in which a few
// randomly chosen characters are overwritten with a space ("salt").
struct TestStr
{
    enum
    {
        saltnum = 50,  // upper bound on the number of salt writes
        size = 10000   // number of 52-character alphabet repetitions
    };

    char repeat_str[52 * size + 1];  // 520000 characters by default, plus the NUL

    // Builds the first test string right away.
    TestStr() { create_str(); }

    // (Re)generates repeat_str in place. Every call reseeds rand() with the
    // current time plus a running counter, so successive calls made within
    // the same second still use different seeds.
    void create_str()
    {
        for (int i = 0; i < size; i++)
        {
            for (int j1 = 0; j1 < 26; j1++)
                repeat_str[i * 52 + j1] = (char)('a' + j1);
            for (int j2 = 26; j2 < 52; j2++)
                repeat_str[i * 52 + j2] = (char)('A' + j2 - 26);
        }
        repeat_str[52 * size] = 0;

        // Add the salt: min(size / 4, saltnum) random positions are set to a
        // space (positions may repeat, so fewer characters may end up changed).
        // Note that rand() % N can never exceed RAND_MAX, so where RAND_MAX is
        // smaller than the buffer only its leading part can receive salt.
        srand(static_cast<unsigned int>(time(NULL) + (seed++)));
        int repeat = (size / 4 < saltnum) ? (size / 4) : saltnum;
        for (int k = 0; k < repeat; k++)
            repeat_str[rand() % (52 * size)] = ' ';
    }

    static int seed;  // running counter mixed into the srand() seed
};

int TestStr::seed = 0;
int
main()
...
{ TestStr test_str; // cout<<test_str.repeat_str<<endl; solve(test_str.repeat_str, 2 ); test_str.create_str(); solve(test_str.repeat_str, 2 ); test_str.create_str(); solve(test_str.repeat_str, 2 ); test_str.create_str(); solve(test_str.repeat_str, 2 ); return 0 ; }
输出如下:
+++++++++++++++++++++++++++++ -- src string len: 520000 -- sub string len>=2 -- hash_size: 350899 +++++++++++++++++++++++++++++ -- time elapse: 188ms -- sub string max nr : 10000 -- sub string is : efg +++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++ -- src string len: 520000 -- sub string len>=2 -- hash_size: 350899 +++++++++++++++++++++++++++++ -- time elapse: 188ms -- sub string max nr : 10000 -- sub string is : rst +++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++ -- src string len: 520000 -- sub string len>=2 -- hash_size: 350899 +++++++++++++++++++++++++++++ -- time elapse: 188ms -- sub string max nr : 10000 -- sub string is : QRS +++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++ -- src string len: 520000 -- sub string len>=2 -- hash_size: 350899 +++++++++++++++++++++++++++++ -- time elapse: 266ms -- sub string max nr : 10000 -- sub string is : cdef +++++++++++++++++++++++++++++