Redis源码研究--字典

计划每天花1小时学习Redis 源码。在博客上做个记录。

--------6月18日-----------

redis的字典dict主要涉及几个数据结构，

dictEntry：具体的k-v链表结点

dictht：哈希表

dict：字典

具体关系为

Redis源码研究--字典

 /* Top-level dictionary handle.
  * It holds two hash tables: ht[0] is the table installed on first
  * initialization, and ht[1] is the table dictExpand() prepares as the
  * target of an incremental rehash. */
 typedef struct dict {
     dictType *type;   /* type descriptor supplied to dictCreate() */
     void *privdata;   /* caller-supplied private data pointer */
     dictht ht[2];     /* [0] = current table, [1] = rehash target */
     int rehashidx;    /* rehashing not in progress if rehashidx == -1 */
     int iterators;    /* number of iterators currently running */
 } dict;

 /* A single hash table: an array of bucket heads plus bookkeeping counters. */
 typedef struct dictht {

     dictEntry **table;      /* bucket array; each slot heads a chain of dictEntry */

     unsigned long size;     /* number of buckets allocated in table */

     unsigned long sizemask; /* mask ANDed with a key hash to select a bucket */

     unsigned long used;     /* number of entries currently stored */

 } dictht;

 /* One key/value node. Nodes that fall into the same bucket are linked
  * together through `next` (separate chaining). */
 typedef struct dictEntry {

     void *key;              /* pointer to the key */

     union {                 /* the value, in one of three representations */

         void *val;          /* generic pointer value */

         uint64_t u64;       /* unsigned 64-bit integer value */

         int64_t s64;        /* signed 64-bit integer value */

     } v;

     struct dictEntry *next; /* next node in the same bucket, or NULL */

 } dictEntry;

一个字典有两个哈希表，冲突后采用了链地址法，很好理解。

一些简单操作采用了宏

/* Field accessors for a dictEntry pointer `he`. */

/* Key pointer of the entry. */
#define dictGetKey(he) ((he)->key)

/* Value of the entry viewed as a generic pointer. */
#define dictGetVal(he) ((he)->v.val)

/* Value of the entry viewed as a signed 64-bit integer. */
#define dictGetSignedIntegerVal(he) ((he)->v.s64)

/* Value of the entry viewed as an unsigned 64-bit integer. */
#define dictGetUnsignedIntegerVal(he) ((he)->v.u64)

------------6月19日----------------------

字典具体用到了两种哈希算法，我只看了简单的那一种，没想到代码竟然可以那么少，算法名字为djb2，

 /* And a case insensitive hash function (based on djb hash).
  *
  * Hashes the `len` bytes starting at `buf`. The accumulator starts from
  * dict_hash_function_seed and, for every byte, becomes
  * hash * 33 + tolower(byte); the multiply is written as a shift-and-add.
  * Arithmetic is unsigned, so overflow simply wraps.
  * Returns the final accumulator value. */
 unsigned int dictGenCaseHashFunction(const unsigned char *buf, int len) {
     unsigned int hash = (unsigned int)dict_hash_function_seed;

     while (len--)
         hash = ((hash << 5) + hash) + (tolower(*buf++)); /* hash * 33 + c */
     return hash;
 }

dict_hash_function_seed是个全局变量，为5381.
The magic of number 33 (why it works better than many other constants, prime or not) has never been adequately explained.
JDK中采用的哈希算法取得数字是31，一个素数。
创建一个新字典并初始化：

 /* Allocate a new dictionary and initialize it via _dictInit().
  *
  * type        - type descriptor stored in the dictionary
  * privDataPtr - caller-supplied private data pointer stored in the dictionary
  *
  * Returns the new dictionary, or NULL if the allocation fails. */
 dict *dictCreate(dictType *type, void *privDataPtr){
     dict *d = malloc(sizeof(*d));

     /* Do not hand a NULL pointer to _dictInit(), which dereferences it. */
     if (d == NULL) return NULL;

     _dictInit(d,type,privDataPtr);
     return d;
 }

 /* Initialize an already-allocated dictionary.
  *
  * Both hash tables are reset to the empty state, the type descriptor and
  * private data pointer are recorded, rehashidx is set to -1 (meaning no
  * rehash is in progress) and the running-iterator count is cleared.
  * Always returns DICT_OK. */
 int _dictInit(dict *d, dictType *type, void *privDataPtr){
     _dictReset(&d->ht[0]);
     _dictReset(&d->ht[1]);
     d->type = type;
     d->privdata = privDataPtr;
     d->rehashidx = -1;
     d->iterators = 0;
     return DICT_OK;
 }

 /* Put a hash table into the empty state: no bucket array and all
  * counters at zero. A previously assigned bucket array is not freed
  * here; the caller must release it beforehand if needed. */
 static void _dictReset(dictht *ht){
     ht->table = NULL;
     ht->size = 0;
     ht->sizemask = 0;
     ht->used = 0;
 }

学了这么多年c语言了，malloc(sizeof(*d))我还是第一次看到。
说到sizeof，我还要提一句：sizeof 通常在编译期求值，所以 malloc(sizeof(*d)) 里的 *d 并不会被真正解引用；只有当操作数是 C99 引入的变长数组（VLA，也就是所谓的“动态数组”）时，sizeof 才在运行时确定。csdn上的回答是错的。
对字典进行紧缩处理，让 哈希表中的数/哈希表长度接近1：

 /* Resize the table to the minimal size that holds all current elements,
  * but never below DICT_HT_INITIAL_SIZE. The actual bucket count is then
  * chosen by dictExpand().
  *
  * Returns DICT_ERR if resizing is disabled (dict_can_resize is zero) or a
  * rehash is already in progress; otherwise returns dictExpand()'s result. */
 int dictResize(dict *d){
     /* Same type as dictht.used, so large counts are not truncated. */
     unsigned long minimal;

     if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR;

     minimal = d->ht[0].used;
     if (minimal < DICT_HT_INITIAL_SIZE)
         minimal = DICT_HT_INITIAL_SIZE;
     return dictExpand(d, minimal);
 }

 /* True while an incremental rehash is in progress. Despite the parameter
  * name, the argument is a dict pointer (the structure owning rehashidx). */
 #define dictIsRehashing(ht) ((ht)->rehashidx != -1)

 /* This is the initial size of every hash table */
 #define DICT_HT_INITIAL_SIZE     4

当字典正在Rehash的时候不能进行Resize操作，初始时哈希表大小为4，哈希表大小一般都是2的幂次方。
如果minimal是5，经过dictExpand后，哈希表大小变为8.

 /* Compute the bucket count to use for a requested `size`.
  *
  * Starting from DICT_HT_INITIAL_SIZE the candidate is doubled until it is
  * greater than or equal to `size`, and that candidate is returned.
  * Requests of LONG_MAX or more are capped and return LONG_MAX. */
 static unsigned long _dictNextPower(unsigned long size){
     unsigned long i = DICT_HT_INITIAL_SIZE;

     if (size >= LONG_MAX) return LONG_MAX;
     while(1) {
         if (i >= size)
             return i;
         i *= 2;
     }
 }

 /* Expand or create the hash table.
  *
  * A new table sized by _dictNextPower(size) is allocated with all buckets
  * set to NULL. If the dictionary has no table yet, the new one becomes
  * ht[0]. Otherwise it becomes ht[1] and rehashidx is set to 0, which
  * starts an incremental rehash.
  *
  * Returns DICT_ERR if a rehash is already in progress or `size` is smaller
  * than the number of stored elements; DICT_OK otherwise. */
 int dictExpand(dict *d, unsigned long size){
     dictht n; /* the new hash table */
     unsigned long realsize = _dictNextPower(size);

     /* the size is invalid if it is smaller than the number of
      * elements already inside the hash table */
     if (dictIsRehashing(d) || d->ht[0].used > size)
         return DICT_ERR;

     /* Allocate the new hash table and initialize all pointers to NULL */
     n.size = realsize;
     n.sizemask = realsize-1;
     n.table = zcalloc(realsize*sizeof(dictEntry*));
     n.used = 0;

     /* Is this the first initialization? If so it's not really a rehashing
      * we just set the first hash table so that it can accept keys. */
     if (d->ht[0].table == NULL) {
         d->ht[0] = n;
         return DICT_OK;
     }

     /* Prepare a second hash table for incremental rehashing */
     d->ht[1] = n;
     d->rehashidx = 0;
     return DICT_OK;
 }

新建了一个哈希表n，size是扩展后的size，ht[0].table 为空说明这是第一次初始化，不是扩展，直接赋值。
ht[0].table 不为空，说明这是一次扩展，把n赋给ht[1]，ReHash标志rehashidx也被设为0.
上边这段不大好理解，先看后面的，一会返过来再研究dictExpand函数。
--------------------6月20日--------------------------

向字典中添加元素需要调用dictAdd函数：

 /* Insert the pair (key, val) into the dictionary.
  * The slot is obtained from dictAddRaw(); when that yields no entry
  * the call fails with DICT_ERR, otherwise the value is stored in the new
  * entry and DICT_OK is returned. */
 int dictAdd(dict *d, void *key, void *val){
     dictEntry *added = dictAddRaw(d, key);

     if (added) {
         dictSetVal(d, added, val);
         return DICT_OK;
     }
     return DICT_ERR;
 }

具体实现需要看dictAddRaw函数：

 /* Low-level add: create an entry for `key` without setting its value.
  *
  * If a rehash is in progress, one rehash step is performed first. The new
  * entry is pushed at the head of its bucket chain; while rehashing it goes
  * into ht[1], otherwise into ht[0].
  *
  * Returns the new entry, or NULL when _dictKeyIndex() reports -1 (the key
  * already exists, or the table could not be expanded). */
 dictEntry *dictAddRaw(dict *d, void *key){
     int index;
     dictEntry *entry;
     dictht *ht;

     if (dictIsRehashing(d)) _dictRehashStep(d);

     /* Get the index of the new element, or -1 if
      * the element already exists. */
     if ((index = _dictKeyIndex(d, key)) == -1)
         return NULL;

     /* Allocate the memory and store the new entry */
     ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0];
     entry = zmalloc(sizeof(*entry));
     entry->next = ht->table[index];
     ht->table[index] = entry;
     ht->used++;

     /* Set the hash entry fields. */
     dictSetKey(d, entry, key);
     return entry;
 }

先判断是不是在进行Rehash，如果在Rehash，执行渐进式Rehash。
找到要插入的key的位置，如果相同的key已经存在了，返回NULL
如果在进行Rehash，ht指向ht[1]表，然后利用链表头插法（这个我熟）将entry插入，更新used。
添加key前需要查找key的位置：

 /* Returns the index of a free slot that can be populated with
  * an hash entry for the given 'key'.
  * If the key already exists, -1 is returned. -1 is also returned when
  * the table needed to be expanded and the expansion failed.
  *
  * Note that if we are in the process of rehashing the hash table, the
  * index is always returned in the context of the second (new) hash table. */
 static int _dictKeyIndex(dict *d, const void *key){
     unsigned int h, idx, table;
     dictEntry *he;

     /* Expand the hash table if needed */
     if (_dictExpandIfNeeded(d) == DICT_ERR)
         return -1;

     /* Compute the key hash value */
     h = dictHashKey(d, key);
     for (table = 0; table <= 1; table++) {
         idx = h & d->ht[table].sizemask;
         /* Search if this slot does not already contain the given key */
         he = d->ht[table].table[idx];
         while(he) {
             if (dictCompareKeys(d, key, he->key))
                 return -1;
             he = he->next;
         }
         /* ht[1] only holds entries while a rehash is running. */
         if (!dictIsRehashing(d)) break;
     }
     return idx;
 }

插入之前，程序会检查一下哈希表空间是否够，需不需要expand。通过某种哈希算法计算key对应的哈希值h，sizemask二进制格式大体是这样的011111111，哈希值跟它一与，相当于只保留了后面几位。算出来的idx就是要插入的索引号。然后需要比较在这个索引上的链表中有没有跟要插入的key一样的，如果重复了，返回-1.

最后判断下当前如果没有在进行Rehash，ht[1]表就不用管了。

-----------------------6月21日---------------------

 /* Expand the hash table if needed.
  *
  * Does nothing while a rehash is in progress. An empty table is grown to
  * DICT_HT_INITIAL_SIZE. Otherwise, once used >= size, the table is grown
  * to twice the element count, provided resizing is enabled or the
  * used/size ratio exceeds dict_force_resize_ratio.
  *
  * Returns DICT_OK when no expansion was needed, otherwise the result of
  * dictExpand(). */
 static int _dictExpandIfNeeded(dict *d){
     /* Incremental rehashing already in progress. Return. */
     if (dictIsRehashing(d)) return DICT_OK;

     /* If the hash table is empty expand it to the initial size. */
     if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

     /* If we reached the 1:1 ratio, and we are allowed to resize the hash
      * table (global setting) or we should avoid it but the ratio between
      * elements/buckets is over the "safe" threshold, we resize doubling
      * the number of buckets. */
     if (d->ht[0].used >= d->ht[0].size &&
         (dict_can_resize ||
          d->ht[0].used/d->ht[0].size > dict_force_resize_ratio))
     {
         return dictExpand(d, d->ht[0].used*2);
     }
     return DICT_OK;
 }

函数名前面带下划线的都表示这是private的。程序第4行又是先判断是否正在进行Rehash，

为什么要说又呢

如果哈希表是空的，那么我们扩展到DICT_HT_INITIAL_SIZE（4）个。

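第13行有点不理解，used什么时候会大于size啊？？？？标记一下，以后再看。（补充：冲突用链地址法解决，一个桶里可以挂多个结点，所以used并不受size限制；当dict_can_resize为0时扩展会被推迟，used就可能超过size，直到used/size超过dict_force_resize_ratio才会强制扩展。）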

dict_can_resize是个全局变量。dict_force_resize_ratio = 5.

/* Using dictEnableResize() / dictDisableResize() we make possible to

* enable/disable resizing of the hash table as needed. This is very important

* for Redis, as we use copy-on-write and don't want to move too much memory

* around when there is a child performing saving operations.

*

* Note that even when dict_can_resize is set to 0, not all resizes are

* prevented: an hash table is still allowed to grow if the ratio between

* the number of elements and the buckets > dict_force_resize_ratio. */

 /* Allow hash table resizing by setting the global dict_can_resize flag
  * to 1. */
 void dictEnableResize(void) {
     dict_can_resize = 1;
 }

 /* Discourage hash table resizing by clearing the global dict_can_resize
  * flag. A table can still grow once its used/size ratio exceeds
  * dict_force_resize_ratio (see _dictExpandIfNeeded). */
 void dictDisableResize(void) {
     dict_can_resize = 0;
 }

字典的 rehash 操作实际上就是执行以下任务：

创建一个比 ht[0]->table 更大的 ht[1]->table ；

将 ht[0]->table 中的所有键值对迁移到 ht[1]->table ；

将原有 ht[0] 的数据清空，并将 ht[1] 替换为新的 ht[0] ；

经过以上步骤之后，程序就在不改变原有键值对数据的基础上，增大了哈希表的大小。

--------------6月22日---------------------------

先上Rehash的代码

 /* Perform up to `n` steps of incremental rehashing.
  *
  * Each step finds the next non-empty bucket of ht[0] at or after
  * rehashidx and moves its whole chain into ht[1], updating both `used`
  * counters. When ht[0] has no elements left, its bucket array is freed,
  * ht[1] takes its place, ht[1] is reset and rehashidx returns to -1.
  *
  * Returns 0 if no rehash is in progress or the rehash completed during
  * this call; returns 1 if there are still keys to move. */
 int dictRehash(dict *d, int n) {
     if (!dictIsRehashing(d)) return 0;

     while(n--) {
         dictEntry *de, *nextde;

         /* Check if we already rehashed the whole table... */
         if (d->ht[0].used == 0) {
             zfree(d->ht[0].table);
             d->ht[0] = d->ht[1];
             _dictReset(&d->ht[1]);
             d->rehashidx = -1;
             return 0;
         }

         /* Note that rehashidx can't overflow as we are sure there are more
          * elements because ht[0].used != 0 */
         assert(d->ht[0].size > (unsigned)d->rehashidx);
         while(d->ht[0].table[d->rehashidx] == NULL) d->rehashidx++;
         de = d->ht[0].table[d->rehashidx];

         /* Move all the keys in this bucket from the old to the new hash HT */
         while(de) {
             unsigned int h;

             /* Save the successor first: de->next is overwritten below. */
             nextde = de->next;
             /* Get the index in the new hash table */
             h = dictHashKey(d, de->key) & d->ht[1].sizemask;
             de->next = d->ht[1].table[h];
             d->ht[1].table[h] = de;
             d->ht[0].used--;
             d->ht[1].used++;
             de = nextde;
         }
         d->ht[0].table[d->rehashidx] = NULL;
         d->rehashidx++;
     }
     return 1;
 }

n步Rehash，在ht[0]中找到第一个不为空的table[rehashidx]，将这个位置的链表（可能只有一个元素）全部移到ht[1]中，并更新ht[0].used、ht[1].used。

执行过程中，ht[0]中的元素如果都已经转到了ht[1]中，即ht[0].used == 0，停止执行，释放ht[0].table指向的空间，ht[1]变为ht[0]，将rehashidx置为-1。

字典还剩一小部分，大体意思我弄懂了，加上之前看的动态字符串sds、双向链表adlist，加上空格注释统计了下共2248行。

   adlist.c

    adlist.h

   dict.c

   dict.h

   sds.c

    sds.h

  total

主要参考了《Redis 设计与实现》。谢谢90后作者了。

秒客网

Redis源码研究--字典

相关文章