哈希

哈希基本概念

哈希（Hash）是一种将任意长度的输入数据映射为固定长度输出的技术。这个过程通过哈希函数实现，输出结果称为哈希值、散列值或摘要。

核心特性

确定性：相同输入总是产生相同输出
高效性：计算速度快
均匀分布：输出值在值域内均匀分布
雪崩效应：输入的微小变化导致输出的巨大变化

哈希函数

哈希函数（Hash Function）是实现哈希映射的核心算法，将输入数据转换为哈希值。

数学定义


xxxxxxxxxx
1
1
h: U → {0, 1, 2, ..., m-1}

其中：

U 是所有可能键的全集
m 是哈希表的大小
h(k) 是键 k 的哈希值

常见哈希函数类型

除法散列法（Division Method）


xxxxxxxxxx
6
1
int division_hash(int key, int table_size) {
2
    return key % table_size;
3
}
4

5
// 示例
6
int hash_value = division_hash(123, 10); // 结果: 3

优点：简单快速

缺点：表大小选择很重要，最好选择质数

乘法散列法（Multiplication Method）


xxxxxxxxxx
9
1
int multiplication_hash(int key, int table_size) {
2
    const double A = 0.6180339887; // 黄金比例的小数部分
3
    double temp = key * A;
4
    temp = temp - floor(temp); // 取小数部分
5
    return (int)(table_size * temp);
6
}
7

8
// 示例
9
int hash_value = multiplication_hash(123, 10);

优点：对表大小不敏感缺点：涉及浮点运算，可能较慢

平方取中法


xxxxxxxxxx
9
1
int square_hash(int key, int table_size) {
2
    long long squared = (long long)key * key;
3
    // 取中间几位数字
4
    int digits = to_string(squared).length();
5
    int start = digits / 4;
6
    int end = digits - start;
7
    string middle = to_string(squared).substr(start, end - start);
8
    return stoi(middle) % table_size;
9
}

字符串哈希函数

多项式滚动哈希


xxxxxxxxxx
26
1
class StringHash {
2
private:
3
    static const int BASE = 31;
4
    static const int MOD = 1e9 + 7;
5
    
6
public:
7
    int polynomial_hash(const string& str) {
8
        long long hash_value = 0;
9
        long long base_power = 1;
10
        
11
        for (char c : str) {
12
            hash_value = (hash_value + (c - 'a' + 1) * base_power) % MOD;
13
            base_power = (base_power * BASE) % MOD;
14
        }
15
        return hash_value;
16
    }
17
    
18
    // 优化版本：避免重复计算
19
    int optimized_hash(const string& str) {
20
        long long hash_value = 0;
21
        for (char c : str) {
22
            hash_value = (hash_value * BASE + (c - 'a' + 1)) % MOD;
23
        }
24
        return hash_value;
25
    }
26
};

DJB2 哈希


xxxxxxxxxx
7
1
int djb2_hash(const string& str) {
2
    unsigned long hash = 5381;
3
    for (char c : str) {
4
        hash = ((hash << 5) + hash) + c; // hash * 33 + c
5
    }
6
    return hash;
7
}

FNV 哈希


xxxxxxxxxx
11
1
int fnv_hash(const string& str) {
2
    const unsigned int FNV_PRIME = 16777619;
3
    const unsigned int FNV_OFFSET_BASIS = 2166136261;
4
    
5
    unsigned int hash = FNV_OFFSET_BASIS;
6
    for (char c : str) {
7
        hash ^= c;
8
        hash *= FNV_PRIME;
9
    }
10
    return hash;
11
}

密码学哈希函数

MD5


xxxxxxxxxx
14
1
#include <openssl/md5.h>
2
#include <iomanip>
3
#include <sstream>
4

5
string md5_hash(const string& input) {
6
    unsigned char digest[MD5_DIGEST_LENGTH];
7
    MD5((unsigned char*)input.c_str(), input.length(), digest);
8
    
9
    stringstream ss;
10
    for (int i = 0; i < MD5_DIGEST_LENGTH; i++) {
11
        ss << hex << setw(2) << setfill('0') << (int)digest[i];
12
    }
13
    return ss.str();
14
}

SHA-256


xxxxxxxxxx
12
1
#include <openssl/sha.h>
2

3
string sha256_hash(const string& input) {
4
    unsigned char digest[SHA256_DIGEST_LENGTH];
5
    SHA256((unsigned char*)input.c_str(), input.length(), digest);
6
    
7
    stringstream ss;
8
    for (int i = 0; i < SHA256_DIGEST_LENGTH; i++) {
9
        ss << hex << setw(2) << setfill('0') << (int)digest[i];
10
    }
11
    return ss.str();
12
}

哈希函数的质量评估

均匀性测试


xxxxxxxxxx
29
1
class HashQualityTester {
2
public:
3
    void test_uniformity(function<int(int)> hash_func, int table_size, int test_count) {
4
        vector<int> buckets(table_size, 0);
5
        
6
        // 生成测试数据
7
        for (int i = 0; i < test_count; i++) {
8
            int hash_value = hash_func(i) % table_size;
9
            buckets[hash_value]++;
10
        }
11
        
12
        // 计算统计信息
13
        double expected = (double)test_count / table_size;
14
        double chi_square = 0.0;
15
        
16
        for (int count : buckets) {
17
            double diff = count - expected;
18
            chi_square += (diff * diff) / expected;
19
        }
20
        
21
        cout << "Chi-square value: " << chi_square << endl;
22
        cout << "Expected frequency: " << expected << endl;
23
        
24
        // 打印分布
25
        for (int i = 0; i < table_size; i++) {
26
            cout << "Bucket " << i << ": " << buckets[i] << endl;
27
        }
28
    }
29
};

哈希冲突

当两个不同的键产生相同的哈希值时，就发生了哈希冲突（Hash Collision）。这是不可避免的现象（鸽笼原理）。

冲突产生的原因

鸽笼原理


xxxxxxxxxx
2
1
如果有 n 个鸽笼和 m 只鸽子，且 m > n，
2
那么至少有一个鸽笼包含多于一只鸽子。

在哈希表中：

鸽笼 = 哈希表的桶
鸽子 = 要存储的键
当键的数量 > 桶的数量时，必然发生冲突

冲突解决方法

链地址法（Separate Chaining）

每个桶维护一个链表存储所有哈希到该位置的元素。


xxxxxxxxxx
125
1
template<typename K, typename V>
2
class HashTableChaining {
3
private:
4
    struct Node {
5
        K key;
6
        V value;
7
        Node* next;
8
        Node(const K& k, const V& v) : key(k), value(v), next(nullptr) {}
9
    };
10
    
11
    vector<Node*> table;
12
    int size;
13
    int count;
14
    
15
public:
16
    HashTableChaining(int initial_size = 16) : size(initial_size), count(0) {
17
        table.resize(size, nullptr);
18
    }
19
    
20
    ~HashTableChaining() {
21
        clear();
22
    }
23
    
24
    int hash_function(const K& key) {
25
        return std::hash<K>{}(key) % size;
26
    }
27
    
28
    void insert(const K& key, const V& value) {
29
        int index = hash_function(key);
30
        Node* current = table[index];
31
        
32
        // 检查是否已存在
33
        while (current) {
34
            if (current->key == key) {
35
                current->value = value; // 更新值
36
                return;
37
            }
38
            current = current->next;
39
        }
40
        
41
        // 插入新节点
42
        Node* new_node = new Node(key, value);
43
        new_node->next = table[index];
44
        table[index] = new_node;
45
        count++;
46
        
47
        // 检查是否需要扩容
48
        if (load_factor() > 0.75) {
49
            resize();
50
        }
51
    }
52
    
53
    bool search(const K& key, V& value) {
54
        int index = hash_function(key);
55
        Node* current = table[index];
56
        
57
        while (current) {
58
            if (current->key == key) {
59
                value = current->value;
60
                return true;
61
            }
62
            current = current->next;
63
        }
64
        return false;
65
    }
66
    
67
    bool remove(const K& key) {
68
        int index = hash_function(key);
69
        Node* current = table[index];
70
        Node* prev = nullptr;
71
        
72
        while (current) {
73
            if (current->key == key) {
74
                if (prev) {
75
                    prev->next = current->next;
76
                } else {
77
                    table[index] = current->next;
78
                }
79
                delete current;
80
                count--;
81
                return true;
82
            }
83
            prev = current;
84
            current = current->next;
85
        }
86
        return false;
87
    }
88
    
89
    double load_factor() {
90
        return (double)count / size;
91
    }
92
    
93
private:
94
    void resize() {
95
        vector<Node*> old_table = table;
96
        int old_size = size;
97
        
98
        size *= 2;
99
        count = 0;
100
        table.clear();
101
        table.resize(size, nullptr);
102
        
103
        // 重新插入所有元素
104
        for (int i = 0; i < old_size; i++) {
105
            Node* current = old_table[i];
106
            while (current) {
107
                Node* next = current->next;
108
                insert(current->key, current->value);
109
                delete current;
110
                current = next;
111
            }
112
        }
113
    }
114
    
115
    void clear() {
116
        for (int i = 0; i < size; i++) {
117
            Node* current = table[i];
118
            while (current) {
119
                Node* next = current->next;
120
                delete current;
121
                current = next;
122
            }
123
        }
124
    }
125
};

开放地址法（Open Addressing）

所有元素都存储在哈希表本身中，冲突时寻找下一个可用位置。

线性探测（Linear Probing）


xxxxxxxxxx
108
1
template<typename K, typename V>
2
class HashTableLinearProbing {
3
private:
4
    struct Entry {
5
        K key;
6
        V value;
7
        bool occupied;
8
        bool deleted; // 标记删除，用于处理删除操作
9
        
10
        Entry() : occupied(false), deleted(false) {}
11
        Entry(const K& k, const V& v) : key(k), value(v), occupied(true), deleted(false) {}
12
    };
13
    
14
    vector<Entry> table;
15
    int size;
16
    int count;
17
    
18
public:
19
    HashTableLinearProbing(int initial_size = 16) : size(initial_size), count(0) {
20
        table.resize(size);
21
    }
22
    
23
    int hash_function(const K& key) {
24
        return std::hash<K>{}(key) % size;
25
    }
26
    
27
    void insert(const K& key, const V& value) {
28
        if (load_factor() >= 0.75) {
29
            resize();
30
        }
31
        
32
        int index = hash_function(key);
33
        int original_index = index;
34
        
35
        while (table[index].occupied && !table[index].deleted) {
36
            if (table[index].key == key) {
37
                table[index].value = value; // 更新值
38
                return;
39
            }
40
            index = (index + 1) % size;
41
            
42
            // 防止无限循环
43
            if (index == original_index) {
44
                resize();
45
                insert(key, value);
46
                return;
47
            }
48
        }
49
        
50
        table[index] = Entry(key, value);
51
        count++;
52
    }
53
    
54
    bool search(const K& key, V& value) {
55
        int index = hash_function(key);
56
        int original_index = index;
57
        
58
        while (table[index].occupied || table[index].deleted) {
59
            if (table[index].occupied && !table[index].deleted && table[index].key == key) {
60
                value = table[index].value;
61
                return true;
62
            }
63
            index = (index + 1) % size;
64
            
65
            if (index == original_index) break;
66
        }
67
        return false;
68
    }
69
    
70
    bool remove(const K& key) {
71
        int index = hash_function(key);
72
        int original_index = index;
73
        
74
        while (table[index].occupied || table[index].deleted) {
75
            if (table[index].occupied && !table[index].deleted && table[index].key == key) {
76
                table[index].deleted = true;
77
                table[index].occupied = false;
78
                count--;
79
                return true;
80
            }
81
            index = (index + 1) % size;
82
            
83
            if (index == original_index) break;
84
        }
85
        return false;
86
    }
87
    
88
    double load_factor() {
89
        return (double)count / size;
90
    }
91
    
92
private:
93
    void resize() {
94
        vector<Entry> old_table = table;
95
        int old_size = size;
96
        
97
        size *= 2;
98
        count = 0;
99
        table.clear();
100
        table.resize(size);
101
        
102
        for (int i = 0; i < old_size; i++) {
103
            if (old_table[i].occupied && !old_table[i].deleted) {
104
                insert(old_table[i].key, old_table[i].value);
105
            }
106
        }
107
    }
108
};

二次探测（Quadratic Probing）


xxxxxxxxxx
18
1
class QuadraticProbing {
2
public:
3
    int probe(int key, int attempt, int table_size) {
4
        int h = key % table_size;
5
        return (h + attempt * attempt) % table_size;
6
    }
7
    
8
    // 改进的二次探测：使用 +/- 模式
9
    int improved_probe(int key, int attempt, int table_size) {
10
        int h = key % table_size;
11
        int offset = (attempt + 1) / 2;
12
        if (attempt % 2 == 1) {
13
            return (h + offset * offset) % table_size;
14
        } else {
15
            return (h - offset * offset + table_size) % table_size;
16
        }
17
    }
18
};

双重哈希（Double Hashing）


xxxxxxxxxx
15
1
class DoubleHashing {
2
public:
3
    int hash1(int key, int table_size) {
4
        return key % table_size;
5
    }
6
    
7
    int hash2(int key, int table_size) {
8
        // 第二个哈希函数，结果不能为0
9
        return 7 - (key % 7);
10
    }
11
    
12
    int probe(int key, int attempt, int table_size) {
13
        return (hash1(key, table_size) + attempt * hash2(key, table_size)) % table_size;
14
    }
15
};

再哈希法（Rehashing）

使用多个哈希函数，当第一个产生冲突时使用第二个，以此类推。


xxxxxxxxxx
40
1
class RehashingTable {
2
private:
3
    vector<int> hash_functions;
4
    vector<vector<pair<int, string>>> table;
5
    int size;
6
    
7
public:
8
    RehashingTable(int table_size) : size(table_size) {
9
        table.resize(size);
10
        // 初始化多个哈希函数的参数
11
        hash_functions = {31, 37, 41, 43, 47};
12
    }
13
    
14
    int hash(int key, int func_index) {
15
        return (key * hash_functions[func_index]) % size;
16
    }
17
    
18
    void insert(int key, const string& value) {
19
        for (int i = 0; i < hash_functions.size(); i++) {
20
            int index = hash(key, i);
21
            
22
            // 检查是否已存在
23
            for (auto& pair : table[index]) {
24
                if (pair.first == key) {
25
                    pair.second = value;
26
                    return;
27
                }
28
            }
29
            
30
            // 如果这个位置为空或冲突较少，插入这里
31
            if (table[index].size() < 3) { // 阈值可调
32
                table[index].push_back({key, value});
33
                return;
34
            }
35
        }
36
        
37
        // 所有哈希函数都冲突严重，使用第一个
38
        table[hash(key, 0)].push_back({key, value});
39
    }
40
};

冲突处理

布隆过滤器（Bloom Filter）


xxxxxxxxxx
38
1
class BloomFilter {
2
private:
3
    vector<bool> bit_array;
4
    int size;
5
    int hash_count;
6
    
7
public:
8
    BloomFilter(int bit_size, int num_hashes) 
9
        : size(bit_size), hash_count(num_hashes) {
10
        bit_array.resize(size, false);
11
    }
12
    
13
    void add(const string& item) {
14
        for (int i = 0; i < hash_count; i++) {
15
            int hash_val = hash_function(item, i) % size;
16
            bit_array[hash_val] = true;
17
        }
18
    }
19
    
20
    bool might_contain(const string& item) {
21
        for (int i = 0; i < hash_count; i++) {
22
            int hash_val = hash_function(item, i) % size;
23
            if (!bit_array[hash_val]) {
24
                return false; // 绝对不存在
25
            }
26
        }
27
        return true; // 可能存在
28
    }
29
    
30
private:
31
    int hash_function(const string& item, int seed) {
32
        int hash = 0;
33
        for (char c : item) {
34
            hash = hash * 31 + c + seed;
35
        }
36
        return abs(hash);
37
    }
38
};

一致性哈希（Consistent Hashing）


xxxxxxxxxx
37
1
class ConsistentHash {
2
private:
3
    map<int, string> ring; // 哈希环
4
    int virtual_nodes;
5
    
6
public:
7
    ConsistentHash(int vnodes = 150) : virtual_nodes(vnodes) {}
8
    
9
    void add_node(const string& node) {
10
        for (int i = 0; i < virtual_nodes; i++) {
11
            string vnode = node + "#" + to_string(i);
12
            int hash = std::hash<string>{}(vnode);
13
            ring[hash] = node;
14
        }
15
    }
16
    
17
    void remove_node(const string& node) {
18
        for (int i = 0; i < virtual_nodes; i++) {
19
            string vnode = node + "#" + to_string(i);
20
            int hash = std::hash<string>{}(vnode);
21
            ring.erase(hash);
22
        }
23
    }
24
    
25
    string get_node(const string& key) {
26
        if (ring.empty()) return "";
27
        
28
        int hash = std::hash<string>{}(key);
29
        auto it = ring.lower_bound(hash);
30
        
31
        if (it == ring.end()) {
32
            it = ring.begin(); // 环形结构
33
        }
34
        
35
        return it->second;
36
    }
37
};

总结

哈希函数选择指南

应用场景	推荐哈希函数	特点
哈希表	除法散列、乘法散列	快速、简单
字符串处理	多项式滚动哈希	支持增量计算
密码学	SHA-256, SHA-3	安全性高
校验和	CRC32, MD5	检错能力强
分布式系统	一致性哈希	负载均衡

冲突处理方法比较

方法	优点	缺点	适用场景
链地址法	实现简单，负载因子可>1	额外内存开销	内存充足时
线性探测	缓存友好，内存紧凑	聚集问题	负载因子较低时
二次探测	减少聚集	可能无法遍历所有位置	中等负载因子
双重哈希	分布均匀	计算开销大	高性能要求

Note

选择合适的哈希函数：根据数据特性选择
控制负载因子：链地址法<1.0，开放地址法<0.75
动态调整大小：及时扩容避免性能退化
考虑缓存局部性：开放地址法通常更好
安全性考虑：密码学应用使用加密哈希函数