最近在一个项目中需要针对上百万条(大约在800W条)的字符串进行相关的处理。该字符串是以文本的形式存放在本地硬盘,并且更新频率为20分钟一次。
具体需求:
1、判断某一个字符串是否存在这800W条字符串中,时间要求在5ms以内
2、根据字符串前缀,返回包含此前缀的10-100条字符串。时间要求在5ms以内
3、占用内存要小
本人接到此需求后,第一反应是用List或者二叉树之类的结构来实现,结果查询效率非常低,远远达不到要求。
后经过大量资料查阅,发现采用DAWG(Directed Acyclic Word Graph)和Trie可以很快速的查找到相应的字符串。
主要思路:将每个字符串的字符进行创建相应的Node,如果有相同的字符则放置在同一个节点中。
例如有6个字符串:Top、Tops、Tap、Tapc、Topa 和 Taps。根据DAWG和Trie的原理,将创建出如下的一棵树:
在添加字符串的时候,根据字符串的字符创建树形结构,每个节点代表一个字符。如果多个字符串具有相同的前缀,则这些前缀字符共用同一组节点,这样可以减少内存占用。
在进行判断字符串是否存在时,只需要依次判断字符串的节点是否存在。
首先创建一个字符节点类LetterNode. 该类主要用于存储相关字符节点的信息:子节点,父节点等等。


/// <summary>
/// The Dawg's letter node: holds one character, a link to its parent node,
/// and a lazily created map of child nodes keyed by character.
/// </summary>
internal sealed class LetterNode
{
    // Initial capacity used when the child dictionary is first created.
    private const int InitialSpreadCapacity = 4;

    // Stays null until ChildNodes is read for the first time.
    private Dictionary<char, LetterNode> _childNodes;

    /// <summary>
    /// Initializes a new instance of the <see cref="LetterNode"/> class.
    /// </summary>
    public LetterNode()
    {
    }

    /// <summary>
    /// Initializes a new instance of the <see cref="LetterNode"/> class.
    /// </summary>
    /// <param name="letter">The letter stored in this node.</param>
    public LetterNode(char letter)
        : this()
    {
        Letter = letter;
    }

    /// <summary>
    /// Gets the node's child nodes, keyed by letter.
    /// </summary>
    /// <remarks>
    /// The dictionary is allocated on first access, so merely reading this
    /// property on a node without children allocates. Use
    /// <see cref="HasChildren"/> to test for children without allocating.
    /// </remarks>
    public Dictionary<char, LetterNode> ChildNodes
    {
        get
        {
            if (this._childNodes == null)
            {
                this._childNodes = new Dictionary<char, LetterNode>(InitialSpreadCapacity);
            }

            return this._childNodes;
        }
    }

    /// <summary>
    /// Gets a value indicating whether this node currently has at least one
    /// child. Never triggers allocation of the child dictionary.
    /// </summary>
    public bool HasChildren
    {
        get { return this._childNodes != null && this._childNodes.Count > 0; }
    }

    /// <summary>
    /// Gets or sets a value indicating whether a word ends at this node.
    /// </summary>
    public bool IsEndOfWord
    {
        get;
        set;
    }

    /// <summary>
    /// Gets or sets the letter.
    /// </summary>
    /// <value>The letter.</value>
    public char Letter
    {
        get;
        set;
    }

    /// <summary>
    /// Gets or sets the parent of this node.
    /// </summary>
    /// <value>The parent, or null when this node has no parent.</value>
    public LetterNode Parent
    {
        get;
        set;
    }

    /// <summary>
    /// Gets the word defined at this element.
    /// </summary>
    /// <value>
    /// The word ending here (the letters on the parent chain, outermost
    /// ancestor first, followed by this node's letter) if
    /// <see cref="IsEndOfWord"/> is set; otherwise the empty string.
    /// </value>
    public string Word
    {
        get
        {
            if (!IsEndOfWord)
            {
                return string.Empty;
            }

            // First pass: count the nodes on the chain so the buffer can be
            // sized exactly.
            int length = 0;
            for (LetterNode node = this; node != null; node = node.Parent)
            {
                length++;
            }

            // Second pass: walk the chain again and fill the buffer back to
            // front. This keeps the work linear in the word length instead of
            // shifting the buffer on every prepend.
            char[] letters = new char[length];
            int index = length;
            for (LetterNode node = this; node != null; node = node.Parent)
            {
                letters[--index] = node.Letter;
            }

            return new string(letters);
        }
    }
}
// NOTE(review): this declaration repeats the LetterNode class that already
// appears earlier in this file and arrived with its doc-comment opener
// missing. It looks like a paste/extraction duplicate; confirm and keep one copy.
/// <summary>
/// The Dawg's letter node
/// </summary>
internal sealed class LetterNode
{
// Initial capacity passed to the child dictionary when it is first created.
private const int InitialSpreadCapacity = 4 ;
// Null until ChildNodes is read for the first time.
private Dictionary < char , LetterNode > _childNodes;
/// <summary>
/// Initializes a new instance of the <see cref="LetterNode"/> class.
/// </summary>
public LetterNode()
{
}
/// <summary>
/// Initializes a new instance of the <see cref="LetterNode"/> class.
/// </summary>
/// <param name="letter"> The letter. </param>
public LetterNode( char letter)
: this ()
{
Letter = letter;
}
/// <summary>
/// The node's child nodes. The dictionary is created on first access.
/// </summary>
public Dictionary < char , LetterNode > ChildNodes
{
get
{
if ( this ._childNodes == null )
{
this ._childNodes = new Dictionary < char , LetterNode > (InitialSpreadCapacity);
}
return this ._childNodes;
}
}
/// <summary>
/// Indicates whether a word ends at this node.
/// </summary>
public bool IsEndOfWord
{
get ;
set ;
}
/// <summary>
/// Gets or sets the letter.
/// </summary>
/// <value> The letter. </value>
public char Letter
{
get ;
set ;
}
/// <summary>
/// Gets or sets the parent of this node.
/// </summary>
/// <value> The parent. </value>
public LetterNode Parent
{
get ;
set ;
}
/// <summary>
/// Gets the word defined at this element.
/// </summary>
/// <value> The word ending here if this is a word, or the empty string. </value>
public string Word
{
get
{
if (IsEndOfWord)
{
StringBuilder sb = new StringBuilder( 20 );
sb.Append(Letter);
var node = Parent;
// Walk up the parent chain, prepending each ancestor's letter.
while (node != null )
{
sb.Insert( 0 , node.Letter);
node = node.Parent;
}
return sb.ToString();
}
else
{
return string .Empty;
}
}
}
}
接下来实现Dawg针对字符串创建树形节点。思路是参照上图。
/// <summary>
/// Inserts <paramref name="item"/> into the graph, creating any letter nodes
/// that are missing along its path. Null or empty input is ignored. The item
/// is lower-cased (invariant culture) before insertion, and the word counter
/// only grows when the final node was not already marked as a word end.
/// </summary>
/// <param name="item">The string to add.</param>
public void Add(string item)
{
    if (string.IsNullOrEmpty(item))
    {
        return;
    }

    string word = item.ToLowerInvariant();
    LetterNode current = null;

    for (int position = 0; position < word.Length; position++)
    {
        char letter = word[position];

        // The first letter is looked up among the root nodes; every later
        // letter among the children of the node reached so far.
        Dictionary<char, LetterNode> siblings =
            current == null ? this._rootNodes : current.ChildNodes;

        LetterNode next;
        if (!siblings.TryGetValue(letter, out next))
        {
            next = new LetterNode(letter) { Parent = current };
            siblings[letter] = next;
        }

        current = next;
    }

    if (current.IsEndOfWord)
    {
        // Word already present: leave the count untouched.
        return;
    }

    current.IsEndOfWord = true;
    this._count++;
}
// NOTE(review): this method repeats the Add definition that already appears
// earlier in this file and arrived with its doc-comment opener missing. It
// looks like a paste/extraction duplicate; confirm and keep one copy.
/// <summary>
/// Adds the specified item. Null or empty input is ignored; the item is
/// lower-cased with the invariant culture before it is inserted.
/// </summary>
/// <param name="item"> The item. </param>
public void Add( string item)
{
if ( string .IsNullOrEmpty(item))
{
return ;
}
item = item.ToLowerInvariant();
LetterNode node = null , parentNode = null ;
// The first letter is resolved against the root nodes, creating one if absent.
if ( ! this ._rootNodes.TryGetValue(item[ 0 ], out node))
{
node = new LetterNode(item[ 0 ]);
this ._rootNodes[item[ 0 ]] = node;
}
// Remaining letters descend through child nodes, creating missing ones.
for ( int i = 1 ; i < item.Length; ++ i)
{
parentNode = node;
if ( ! node.ChildNodes.TryGetValue(item[i], out node))
{
node = new LetterNode(item[i]);
node.Parent = parentNode;
parentNode.ChildNodes[item[i]] = node;
}
}
// Only count the word the first time its final node is marked.
if ( ! node.IsEndOfWord)
{
node.IsEndOfWord = true ;
this ._count ++ ;
}
}
判断字符串是否存在:
/// <summary>
/// Walks the graph along the letters of <paramref name="prefix"/> (lower-cased
/// with the invariant culture) and returns the node reached by its last letter.
/// </summary>
/// <param name="prefix">The prefix to follow. May be null or empty.</param>
/// <returns>
/// The node for the final letter of the prefix, or null when the prefix is
/// null, empty, or leaves the graph at some letter.
/// </returns>
private LetterNode SearchPrefixLetterNode(string prefix)
{
    // Null/empty input is treated as "not found" rather than throwing,
    // matching how Add silently ignores null/empty items.
    if (string.IsNullOrEmpty(prefix))
    {
        return null;
    }

    prefix = prefix.ToLowerInvariant();

    LetterNode node;
    if (!this._rootNodes.TryGetValue(prefix[0], out node))
    {
        return null;
    }

    // ChildNodes is only read when another letter still has to be matched:
    // the property allocates its dictionary on first access, so touching it
    // on the final node would allocate for every successful lookup of a leaf.
    for (int i = 1; i < prefix.Length; ++i)
    {
        if (!node.ChildNodes.TryGetValue(prefix[i], out node))
        {
            return null;
        }
    }

    return node;
}
// NOTE(review): orphaned brace block with no method header. Its statements
// repeat the body of SearchPrefixLetterNode directly above; it looks like a
// paste/extraction duplicate. Confirm and remove.
{
prefix = prefix.ToLowerInvariant();
Dictionary < char , LetterNode > nodes = this ._rootNodes;
LetterNode node = null ;
// Follow one letter per iteration; bail out as soon as a letter is missing.
for ( int i = 0 ; i < prefix.Length; ++ i)
{
if (nodes.TryGetValue(prefix[i], out node))
{
nodes = node.ChildNodes;
}
else
{
return null ;
}
}
return node;
}
/// <summary>
/// Determines whether <paramref name="item"/> has been added as a complete word.
/// </summary>
/// <param name="item">The string to look up. May be null or empty.</param>
/// <returns>
/// true when the lookup reaches a node that is marked as the end of a word;
/// false otherwise, including for null or empty input.
/// </returns>
public bool Contains(string item)
{
    // Add ignores null/empty items, so they can never be present; answer
    // directly instead of letting the lookup dereference a null string.
    if (string.IsNullOrEmpty(item))
    {
        return false;
    }

    var node = this.SearchPrefixLetterNode(item);
    return node != null && node.IsEndOfWord;
}
// NOTE(review): orphaned brace block with no method header. Its statements
// repeat the body of Contains directly above; it looks like a
// paste/extraction duplicate. Confirm and remove.
{
var node = this .SearchPrefixLetterNode(item);
return node != null && node.IsEndOfWord;
}