Tech Tips: C++でハッシュのマップを実装する

2011年7月14日木曜日

C++でハッシュのマップを実装する

C++のset/mapは、二分木で実装されている。要素数が大きくなると重くなってしまうので（平衡木の回転の処理が重たい？）、ハッシュを使ってmapみたいなものを実装してみた（挿入、参照のみ）。

template <class K, class V>
class hashMap {
    static const unsigned int DEFAULT_SIZE = 1000000;
    vector<pair<K, V> > *contents;
    unsigned int _size;
    unsigned int (*_hashValue) (K, unsigned int);

public:
    hashMap(unsigned int (*func) (K, unsigned int) , unsigned int size = DEFAULT_SIZE) {
        _hashValue = func;
        _size = size;
        contents = new vector<pair<K, V> > [_size];
    }

    ~hashMap() {
        delete [] contents;
    }

    bool hashElement(K s) {
        unsigned int h = _hashValue(s, _size);

        for (int i = 0; i < (int)contents[h].size(); i++) {
            if (contents[h][i].first == s)
                return true;
        }

        return false;
    }

    void insert(K x, V n) {
        if (hashElement(x))
            return;

        unsigned int h = _hashValue(x, _size);
        contents[h].push_back(make_pair(x, n));
    }

    pair<K, V> *getElement(K x) {
        unsigned int h = _hashValue(x, _size);
        for (int i = 0; i < (int)contents[h].size(); i++)
            if (contents[h][i].first == x)
                return &contents[h][i];

        return NULL;
    }

    void printHashBalance() {
        int cnt = 0;
        double ret = 0.0;
        int worst = 0;

        for (int i = 0; i < (int)_size; i++) {
            if (contents[i].size()) {
                ++cnt;
                ret += contents[i].size();
                worst = max<int>(worst, contents[i].size());
            }
        }

        if (!cnt)
            cerr << "Hash is empty" << endl;
        else {
            cerr << "collisions occurred(average): " << (ret/cnt) << endl;
            cerr << "collisions occurred(worst): " << worst << endl;
        }
    }
};

これで、参照/挿入はO(1)でできるはず。衝突がどれくらい起きているかを調べるためにprintHashBalance()という関数を定義した。コンストラクタには、使用するハッシュ関数のポインタと、ハッシュ値のサイズを渡す。

適当にテストコードを書いて、標準関数のmapと比較する。WORDS_NUM個のランダムな文字列を、それぞれのコンテナにぶちこんで、最頻出の文字列を求める。

#define WORDS_NUM 10000

/*
* generate random strings of length between 3 and 8 inclusive
*/
vector<string> randomGen() {
    vector<string> ret;

    for (int i = 0; i < WORDS_NUM; i++) {
        int len = 3 + rand() % 6;

        char str[len+1];
        for (int j = 0; j < len; j++)
            str[j] = 97 + rand() % 26;
        str[len] = '\0';

        ret.push_back(str);
    }

    return ret;
}
/*
* timer
*/
class Timer {
    double start_t;
public:
    void start() {
        cout << "Timer Start." << endl;
        start_t = gettimeofday_sec();
    }

    void end() {
        cout << "Timer stopped." << endl;
        cout << "Time Elapsed: " << gettimeofday_sec() - start_t << endl;
    }
};

//#define _MAP
#define _HASH

unsigned int stringHash(string str, unsigned int sz) {
    unsigned int ret = 0;

    for (int i = 0; i < (int)str.size(); i++)
        ret = 26 * ret + str[i] - 97;

    return ret % sz;
}

int main() {
    vector<string> words = randomGen();
    Timer timer;
    timer.start();

#ifdef _MAP
    map<string, int> wc;
    int cnt = 0;
    string best;

    for (int i = 0; i < (int)words.size(); i++) {
        if (wc.count(words[i])) {
            int n = ++wc[words[i]];

            if (n > cnt) {
                cnt = n;
                best = words[i];
            }
        }
        else
            wc[words[i]] = 1;
    }
    cout << cnt << ": " << best << endl;
#endif

#ifdef _HASH
    hashMap<string, int> wc(stringHash, WORDS_NUM);
    int cnt = 0;
    string best;

    for (int i = 0; i < (int)words.size(); i++) {
        if (wc.hashElement(words[i])) {
            int n = ++wc.getElement(words[i])->second;

            if (n > cnt) {
                cnt = n;
                best = words[i];
            }
        }
        else
            wc.insert(words[i], 1);
    }
    cout << cnt << ":" << best << endl;
    wc.printHashBalance();
#endif

    timer.end();

    return 0;
}

以下比較結果。HashMapの_sizeは入力文字数と同じサイズにした。一つのハッシュ値に格納される入力値の種類は平均1.45くらいだった。

WORDS_NUM	100,000	1,000,000	10,000,000
Map [s]	0.346	4.999	66.71
HashMap [s]	0.2	2.052	20.274

もっと速くなると思ったが、どこがネックなんだろう。オーダー的にはハッシュ値の計算が重いけど、実際はvectorのメモリ関連処理が遅そうな気がする。プロファイルとってみるかー。

Tech Tips

Page List

Search on the blog

2011年7月14日木曜日

C++でハッシュのマップを実装する

0 件のコメント:

コメントを投稿

Blogger Syntax Highliter