原子化字符串

最新推荐文章于 2025-04-12 11:20:16 发布

原创最新推荐文章于 2025-04-12 11:20:16 发布 · 1.4k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#string #iterator #delete #null #emacs #lisp

工作专栏收录该内容

28 篇文章

订阅专栏

本文介绍了一个简单的原子化字符串管理实现，通过使用哈希表来提高符号和文件名管理的效率。该实现在需要频繁查找字符串的应用场景下尤其有用，如编译器开发等。

在编译器或者其它语言处理软件的开发过程中，字符串管理是非常重要的。
原子化字符串可以提升符号管理和文件名管理的效率。

下面是区区编写的一个简单实现，intern 一词来源于 Emacs Lisp 的 intern 函数：

///////////////////////Pimpl模式声明
class AtomsImpl;

// Public handle of the atom (interned string) table.
//
// All work is forwarded to a heap-allocated AtomsImpl (pimpl idiom), so users
// of this class do not need to see the table's data structures.
//
// intern() returns a pointer to the table's own copy of the string. Interning
// the same contents again returns the same pointer. The stored copies are
// released when the Atoms object is destroyed.
class Atoms{
public:
    // Interns the contents of s and returns the table's copy.
    const char * intern(const std::string &s);
    // Interns the NUL-terminated string s and returns the table's copy.
    const char * intern(const char *s);
    Atoms();
    ~Atoms();
private:
    // Copying is disabled: the destructor deletes pimpl, so an implicitly
    // generated copy would make two objects delete the same AtomsImpl.
    // Declared but intentionally left undefined.
    Atoms(const Atoms &);
    Atoms &operator=(const Atoms &);

    AtomsImpl *pimpl;   // owned; created in the constructor, deleted in the destructor
};



///////////////////////Atoms string container实现

// One entry of a bucket chain. Each node owns the character buffer `s`.
struct AtomsImplNode{
    size_t len;             // number of bytes in s, not counting the trailing '\0'
    size_t hash_val;        // full (unreduced) hash; reused when the table is rehashed
    char * s;               // new[]-allocated copy of the string, always NUL-terminated
    AtomsImplNode * next;   // next node in the same bucket, or NULL
};

// Chained hash table of interned strings.
//
// Every distinct byte sequence is stored exactly once. The pointer returned
// by intern() stays valid and unchanged until the AtomsImpl is destroyed:
// rehashing only relinks nodes and never reallocates a string buffer.
class AtomsImpl{
public:
    // Returns the table's copy of s, inserting it first if it is not present.
    // Strings are compared over their full size(), so embedded '\0' bytes
    // are significant.
    const char * intern(const std::string &s);
    // Same as above for a NUL-terminated C string. Returns NULL when s is NULL.
    const char * intern(const char *s);
    AtomsImpl();
    ~AtomsImpl();
private:
    // Copying is disabled: the table owns its nodes through raw pointers, so
    // a member-wise copy would free them twice. Intentionally left undefined.
    AtomsImpl(const AtomsImpl &);
    AtomsImpl &operator=(const AtomsImpl &);

    // Moves every node into a fresh bucket array of the given size.
    void rehash(size_t new_buckets_size);

    std::vector<AtomsImplNode*> buckets;   // heads of the bucket chains
    size_t atom_count;                     // number of stored strings
};

AtomsImpl::AtomsImpl():buckets(1), atom_count(0){
}

AtomsImpl::~AtomsImpl(){
    for (size_t b = 0; b < buckets.size(); ++b) {
        AtomsImplNode *node = buckets[b];
        while (node) {
            AtomsImplNode *following = node->next;
            delete [] node->s;
            delete node;
            node = following;
        }
    }
}

void AtomsImpl::rehash(size_t new_buckets_size){
    // Allocate first: if this throws, the current table is still untouched.
    std::vector<AtomsImplNode*> new_buckets(new_buckets_size);
    for (size_t b = 0; b < buckets.size(); ++b) {
        AtomsImplNode *node = buckets[b];
        while (node) {
            AtomsImplNode *following = node->next;
            // Push at the front of the destination chain; the order inside a
            // chain does not matter, and this avoids walking to its tail.
            AtomsImplNode *&slot = new_buckets[node->hash_val % new_buckets_size];
            node->next = slot;
            slot = node;
            node = following;
        }
    }
    buckets.swap(new_buckets);
}

const char *AtomsImpl::intern(const std::string &s){
    // Grow once there are more atoms than buckets.
    if (atom_count > buckets.size())
        rehash(buckets.size() * 2);

    const size_t len = s.size();
    const char *data = s.data();

    // Multiplicative hash over all len bytes. Bytes are hashed as unsigned
    // values so the result does not depend on the signedness of plain char.
    size_t hash_val = 7;
    for (size_t i = 0; i < len; ++i)
        hash_val = hash_val * 31 + static_cast<unsigned char>(data[i]);

    AtomsImplNode *&head = buckets[hash_val % buckets.size()];
    for (AtomsImplNode *node = head; node; node = node->next) {
        // memcmp over len bytes (not strcmp) so embedded '\0' bytes count.
        if (node->len == len &&
            node->hash_val == hash_val &&
            memcmp(node->s, data, len) == 0)
            return node->s;
    }

    AtomsImplNode *newNode = new AtomsImplNode;
    try {
        newNode->s = new char[len+1];
    } catch (...) {
        delete newNode;   // do not leak the node if the buffer allocation fails
        throw;
    }
    // Copy exactly len bytes; strcpy would stop at the first embedded '\0'
    // and leave the rest of the buffer uninitialized.
    memcpy(newNode->s, data, len);
    newNode->s[len] = '\0';
    newNode->len = len;
    newNode->hash_val = hash_val;
    newNode->next = head;
    head = newNode;
    ++atom_count;
    return newNode->s;
}

const char *AtomsImpl::intern(const char *s){
    // Constructing a std::string from a null pointer is undefined behaviour.
    if (!s)
        return NULL;
    return intern(std::string(s));
}

// Creates the implementation object that every other member forwards to.
Atoms::Atoms()
    : pimpl(new AtomsImpl)
{
}
// Destroys the implementation object allocated by the constructor.
Atoms::~Atoms()
{
    delete pimpl;
}
// Forwards to AtomsImpl::intern(const string &) and returns its result unchanged.
const char * Atoms::intern(const std::string &s)
{
    const char *atom = pimpl->intern(s);
    return atom;
}
// Forwards to AtomsImpl::intern(const char *) and returns its result unchanged.
const char * Atoms::intern(const char *s)
{
    const char *atom = pimpl->intern(s);
    return atom;
}