Memcached source code analysis: slabs.c (slab memory allocator)

Date: 2023-03-10 05:51:15
  1. #include "memcached.h"
  2. #include <sys/stat.h>
  3. #include <sys/socket.h>
  4. #include <sys/signal.h>
  5. #include <sys/resource.h>
  6. #include <fcntl.h>
  7. #include <netinet/in.h>
  8. #include <errno.h>
  9. #include <stdlib.h>
  10. #include <stdio.h>
  11. #include <string.h>
  12. #include <assert.h>
  13. #include <pthread.h>
  14. typedef struct {
  15. unsigned int size;      /* sizes of items */     //item或者说chunk的大小
  16. unsigned int perslab;   /* how many items per slab */ //每个slab有多少个item,slab又称“页”
  17. /**
  18. 当前slabclass的空闲item链表,也是可用item链表,当前slabclass一切可以用的内存空间都在此,
  19. 这里是内存分配的入口,分配内存的时候都是在这个链表上挤一个出去。
  20. ps:memcached的新版本才开始把slots作为“所有空闲的item链接”的用途,以前的版本slots链表保存的是“回收的item”的意思,
  21. 而旧版本新分配的slab,是用end_page_ptr指针及end_page_free来控制,此版本已不用。
  22. */
  23. void *slots;           /* list of item ptrs */
  24. unsigned int sl_curr;   /* total free items in list */  //当前slabclass还剩多少空闲的item,即上面的slots数
  25. unsigned int slabs;     /* how many slabs were allocated for this class */ //这个slabclass分配了多少个slab了
  26. /**
  27. slab_list是这个slabclass下的slabs列表,逻辑上是一个数组,每个元素是一个slab指针。
  28. list_size是slab_list的元素个数。
  29. 注意这个list_size和上面的slabs的不同:
  30. 由于slab_list是一个空间大小固定的数组,是数组!而list_size是这个数组元素的个数,代表slab_list的空间大小。
  31. slabs代表已经分配出去的slabs数,list_size则代表可以有多少个slabs数
  32. 所以当slabs等于list_size的时候代表这个slab_list已经满了,得增大空间。
  33. */
  34. void **slab_list;       /* array of slab pointers */
  35. unsigned int list_size; /* size of prev array */
  36. unsigned int killing;  /* index+1 of dying slab, or zero if none */
  37. size_t requested; /* The number of requested bytes */
  38. } slabclass_t;
  39. static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
  40. static size_t mem_limit = 0; //内存上限
  41. static size_t mem_malloced = 0; //已分配的内存
  42. static int power_largest;
  43. static void *mem_base = NULL; //预分配的内存空间
  44. static void *mem_current = NULL;
  45. static size_t mem_avail = 0;
  46. static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
  47. static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;
  48. static int do_slabs_newslab(const unsigned int id);
  49. static void *memory_allocate(size_t size);
  50. static void do_slabs_free(void *ptr, const size_t size, unsigned int id);
  51. static void slabs_preallocate (const unsigned int maxslabs);
  52. //根据item大小找到合适的slabclass
  53. unsigned int slabs_clsid(const size_t size) {
  54. int res = POWER_SMALLEST;
  55. if (size == 0)
  56. return 0;
  57. while (size > slabclass[res].size)
  58. if (res++ == power_largest)     /* won't fit in the biggest slab */
  59. return 0;
  60. return res;
  61. }
  62. /**
  63. 初始化slabs,这里会对一些内存管理进行初始化
  64. */
  65. void slabs_init(const size_t limit, const double factor, const bool prealloc) {
  66. int i = POWER_SMALLEST - 1;
  67. unsigned int size = sizeof(item) + settings.chunk_size;
  68. mem_limit = limit; //这个limit就是启动时候用户设置的-m xx中的xx,最大的内存上限
  69. if (prealloc) {
  70. /**
  71. 如果用户开启了预分配,则先把上限的内存先分配出来,放到mem_base全局变量中。
  72. 所以这个时候服务就拥有了一大坨内存,以后要分配的内存都是从这一坨里面割下来。
  73. */
  74. mem_base = malloc(mem_limit);
  75. if (mem_base != NULL) {
  76. mem_current = mem_base;
  77. mem_avail = mem_limit;
  78. } else {
  79. fprintf(stderr, "Warning: Failed to allocate requested memory in"
  80. " one large chunk.\nWill allocate in smaller chunks\n");
  81. }
  82. }
  83. //下面是初始化各个slabclass对象
  84. memset(slabclass, 0, sizeof(slabclass));
  85. while (++i < POWER_LARGEST && size <= settings.item_size_max / factor) {
  86. /* Make sure items are always n-byte aligned */
  87. if (size % CHUNK_ALIGN_BYTES)
  88. size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
  89. slabclass[i].size = size;
  90. slabclass[i].perslab = settings.item_size_max / slabclass[i].size;
  91. size *= factor;
  92. if (settings.verbose > 1) {
  93. fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
  94. i, slabclass[i].size, slabclass[i].perslab);
  95. }
  96. }
  97. power_largest = i;
  98. slabclass[power_largest].size = settings.item_size_max;
  99. slabclass[power_largest].perslab = 1;
  100. if (settings.verbose > 1) {
  101. fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
  102. i, slabclass[i].size, slabclass[i].perslab);
  103. }
  104. {
  105. char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
  106. if (t_initial_malloc) {
  107. mem_malloced = (size_t)atol(t_initial_malloc);
  108. }
  109. }
  110. if (prealloc) {
  111. slabs_preallocate(power_largest);
  112. }
  113. }
  114. /**
  115. 内存预分配,如果用户开启了预分配,则会调用此方法,先从mem_base为分每个slabclass割一个slab大小下来。
  116. */
  117. static void slabs_preallocate (const unsigned int maxslabs) {
  118. int i;
  119. unsigned int prealloc = 0;
  120. for (i = POWER_SMALLEST; i <= POWER_LARGEST; i++) {
  121. if (++prealloc > maxslabs)
  122. return;
  123. if (do_slabs_newslab(i) == 0) {
  124. fprintf(stderr, "Error while preallocating slab memory!\n"
  125. "If using -L or other prealloc options, max memory must be "
  126. "at least %d megabytes.\n", power_largest);
  127. exit(1);
  128. }
  129. }
  130. }
  131. static int grow_slab_list (const unsigned int id) {
  132. slabclass_t *p = &slabclass[id];
  133. /**
  134. p->slab_list是一个空间大小固定的数组,是数组!而list_size是这个数组分配的空间。
  135. p->slabs代表已经分配出去的slabs数
  136. 而p->list_size代表可以用多少个slabs数
  137. 所以当slabs等于list_size的时候代表这个slab_list已经满了,得增大空间。
  138. */
  139. if (p->slabs == p->list_size) {
  140. size_t new_size =  (p->list_size != 0) ? p->list_size * 2 : 16;
  141. void *new_list = realloc(p->slab_list, new_size * sizeof(void *)); //
  142. if (new_list == 0) return 0;
  143. p->list_size = new_size;
  144. p->slab_list = new_list;
  145. }
  146. return 1;
  147. }
  148. /**
  149. 把整个slab打散成一个个(也叫chunk)放到相应的slots链表中
  150. */
  151. static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
  152. slabclass_t *p = &slabclass[id];
  153. int x;
  154. for (x = 0; x < p->perslab; x++) {
  155. do_slabs_free(ptr, 0, id); //这个函数主要作用是让当前item空间可用,即加到slots链表中。
  156. ptr += p->size;
  157. }
  158. }
  159. /**
  160. 为slabclass[id]分配新的slab,仅当当前的slabclass中slots没有空闲的空间才调用
  161. 此函数分配新的slab
  162. */
  163. static int do_slabs_newslab(const unsigned int id) {
  164. slabclass_t *p = &slabclass[id];
  165. int len = settings.slab_reassign ? settings.item_size_max
  166. : p->size * p->perslab; //先判断是否开启了自定义slab大小,如果没有就按默认的,即约1M
  167. char *ptr;
  168. /**
  169. 下面if的逻辑是:
  170. 如果内存超出了限制,分配失败进入if,返回0
  171. 否则调用grow_slab_list检查是否要增大slab_list的大小
  172. 如果在grow_slab_list返回失败,则不继续分配空间,进入if,返回0
  173. 否则分配空间memory_allocate,如果分配失败,同样进入if,返回0;
  174. */
  175. if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
  176. (grow_slab_list(id) == 0) ||
  177. ((ptr = memory_allocate((size_t)len)) == 0)) {
  178. MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
  179. return 0;
  180. }
  181. memset(ptr, 0, (size_t)len); //清干净内存空间
  182. split_slab_page_into_freelist(ptr, id); //把新申请的slab放到slots中去
  183. p->slab_list[p->slabs++] = ptr; //把新的slab加到slab_list数组中
  184. mem_malloced += len; //记下已分配的空间大小
  185. MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);
  186. return 1;
  187. }
  188. /**
  189. 根据item大小和slabsclass分配空间
  190. */
  191. static void *do_slabs_alloc(const size_t size, unsigned int id) {
  192. slabclass_t *p;
  193. void *ret = NULL;
  194. item *it = NULL;
  195. if (id < POWER_SMALLEST || id > power_largest) { //默认最大是200,最小是1
  196. MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
  197. return NULL;
  198. }
  199. p = &slabclass[id]; //slabclass是一个全局变量,是各个slabclass对象数组,在这取得当前id对应的slabclass
  200. assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);
  201. /* fail unless we have space at the end of a recently allocated page,
  202. we have something on our freelist, or we could allocate a new page */
  203. /**
  204. 下面这个if的逻辑相当于:
  205. 如果p->sl_curr==0,即slots链表中没有空闲的空间,则do_slabs_newslab分配新slab
  206. 如果p->sl_curr==0,且do_slabs_newslab分配新slab失败,则进入if,ret = NULL,否则进入下面的elseif
  207. */
  208. if (! (p->sl_curr != 0 || do_slabs_newslab(id) != 0)) {
  209. /* We don't have more memory available */
  210. ret = NULL;
  211. } else if (p->sl_curr != 0) { //如果进入此分支是因为slots链表中还有空闲的空间
  212. /* return off our freelist */
  213. //把空闲的item分配出去
  214. it = (item *)p->slots;
  215. p->slots = it->next;
  216. if (it->next) it->next->prev = 0;
  217. p->sl_curr--;
  218. ret = (void *)it;
  219. }
  220. if (ret) {
  221. p->requested += size; //分配成功,记下已分配的字节数
  222. MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
  223. } else {
  224. MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
  225. }
  226. return ret;
  227. }
  228. /**
  229. 这个函数的命名虽然叫do_slabs_free,听上去好像是释放空间,其实质是把空间变成可用。
  230. 怎样的空间才算可用?就是加到当前slabclass的slots链表中而已。
  231. 所以新申请的slab也会调用这个函数,让整个slab变为可用。
  232. ps: 以前的memcached版本slots链表保存的是回收的item空间,而
  233. 现在保存的是所有可用的item空间。
  234. */
  235. static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
  236. slabclass_t *p;
  237. item *it;
  238. assert(((item *)ptr)->slabs_clsid == 0);
  239. assert(id >= POWER_SMALLEST && id <= power_largest);
  240. if (id < POWER_SMALLEST || id > power_largest)
  241. return;
  242. MEMCACHED_SLABS_FREE(size, id, ptr);
  243. p = &slabclass[id];
  244. it = (item *)ptr;
  245. it->it_flags |= ITEM_SLABBED; //把item标记为slabbed状态
  246. it->prev = 0;
  247. it->next = p->slots;  //插入到slots链表中
  248. if (it->next) it->next->prev = it;
  249. p->slots = it;
  250. p->sl_curr++; //空闲item数加1
  251. p->requested -= size;
  252. return;
  253. }
  254. static int nz_strcmp(int nzlength, const char *nz, const char *z) {
  255. int zlength=strlen(z);
  256. return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1;
  257. }
  258. bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
  259. bool ret = true;
  260. if (add_stats != NULL) {
  261. if (!stat_type) {
  262. /* prepare general statistics for the engine */
  263. STATS_LOCK();
  264. APPEND_STAT("bytes", "%llu", (unsigned long long)stats.curr_bytes);
  265. APPEND_STAT("curr_items", "%u", stats.curr_items);
  266. APPEND_STAT("total_items", "%u", stats.total_items);
  267. STATS_UNLOCK();
  268. item_stats_totals(add_stats, c);
  269. } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
  270. item_stats(add_stats, c);
  271. } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
  272. slabs_stats(add_stats, c);
  273. } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
  274. item_stats_sizes(add_stats, c);
  275. } else {
  276. ret = false;
  277. }
  278. } else {
  279. ret = false;
  280. }
  281. return ret;
  282. }
  283. static void do_slabs_stats(ADD_STAT add_stats, void *c) {
  284. int i, total;
  285. /* Get the per-thread stats which contain some interesting aggregates */
  286. struct thread_stats thread_stats;
  287. threadlocal_stats_aggregate(&thread_stats);
  288. total = 0;
  289. for(i = POWER_SMALLEST; i <= power_largest; i++) {
  290. slabclass_t *p = &slabclass[i];
  291. if (p->slabs != 0) {
  292. uint32_t perslab, slabs;
  293. slabs = p->slabs;
  294. perslab = p->perslab;
  295. char key_str[STAT_KEY_LEN];
  296. char val_str[STAT_VAL_LEN];
  297. int klen = 0, vlen = 0;
  298. APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
  299. APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
  300. APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
  301. APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
  302. APPEND_NUM_STAT(i, "used_chunks", "%u",
  303. slabs*perslab - p->sl_curr);
  304. APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
  305. /* Stat is dead, but displaying zero instead of removing it. */
  306. APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
  307. APPEND_NUM_STAT(i, "mem_requested", "%llu",
  308. (unsigned long long)p->requested);
  309. APPEND_NUM_STAT(i, "get_hits", "%llu",
  310. (unsigned long long)thread_stats.slab_stats[i].get_hits);
  311. APPEND_NUM_STAT(i, "cmd_set", "%llu",
  312. (unsigned long long)thread_stats.slab_stats[i].set_cmds);
  313. APPEND_NUM_STAT(i, "delete_hits", "%llu",
  314. (unsigned long long)thread_stats.slab_stats[i].delete_hits);
  315. APPEND_NUM_STAT(i, "incr_hits", "%llu",
  316. (unsigned long long)thread_stats.slab_stats[i].incr_hits);
  317. APPEND_NUM_STAT(i, "decr_hits", "%llu",
  318. (unsigned long long)thread_stats.slab_stats[i].decr_hits);
  319. APPEND_NUM_STAT(i, "cas_hits", "%llu",
  320. (unsigned long long)thread_stats.slab_stats[i].cas_hits);
  321. APPEND_NUM_STAT(i, "cas_badval", "%llu",
  322. (unsigned long long)thread_stats.slab_stats[i].cas_badval);
  323. APPEND_NUM_STAT(i, "touch_hits", "%llu",
  324. (unsigned long long)thread_stats.slab_stats[i].touch_hits);
  325. total++;
  326. }
  327. }
  328. APPEND_STAT("active_slabs", "%d", total);
  329. APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
  330. add_stats(NULL, 0, NULL, 0, c);
  331. }
  332. /**
  333. 分配内存空间
  334. */
  335. static void *memory_allocate(size_t size) {
  336. void *ret;
  337. /**
  338. 有两种分配策略
  339. 1)如果是开启了内存预分配策略,则只需要从预分配好的内存块那里割一块出来。即进入下面的else分支
  340. 2)如果没有开启预分配,则malloc分配内存
  341. 关于预分配详见 slabs_init
  342. */
  343. if (mem_base == NULL) {
  344. /* We are not using a preallocated large memory chunk */
  345. ret = malloc(size);
  346. } else {
  347. ret = mem_current;
  348. if (size > mem_avail) {
  349. return NULL;
  350. }
  351. /* mem_current pointer _must_ be aligned!!! */
  352. if (size % CHUNK_ALIGN_BYTES) {
  353. size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
  354. }
  355. mem_current = ((char*)mem_current) + size;
  356. if (size < mem_avail) {
  357. mem_avail -= size;
  358. } else {
  359. mem_avail = 0;
  360. }
  361. }
  362. return ret;
  363. }
  364. void *slabs_alloc(size_t size, unsigned int id) {
  365. void *ret;
  366. pthread_mutex_lock(&slabs_lock);
  367. ret = do_slabs_alloc(size, id);
  368. pthread_mutex_unlock(&slabs_lock);
  369. return ret;
  370. }
  371. void slabs_free(void *ptr, size_t size, unsigned int id) {
  372. pthread_mutex_lock(&slabs_lock);
  373. do_slabs_free(ptr, size, id);
  374. pthread_mutex_unlock(&slabs_lock);
  375. }
  376. void slabs_stats(ADD_STAT add_stats, void *c) {
  377. pthread_mutex_lock(&slabs_lock);
  378. do_slabs_stats(add_stats, c);
  379. pthread_mutex_unlock(&slabs_lock);
  380. }
  381. void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
  382. {
  383. pthread_mutex_lock(&slabs_lock);
  384. slabclass_t *p;
  385. if (id < POWER_SMALLEST || id > power_largest) {
  386. fprintf(stderr, "Internal error! Invalid slab class\n");
  387. abort();
  388. }
  389. p = &slabclass[id];
  390. p->requested = p->requested - old + ntotal;
  391. pthread_mutex_unlock(&slabs_lock);
  392. }
  393. static pthread_cond_t maintenance_cond = PTHREAD_COND_INITIALIZER;
  394. static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
  395. static volatile int do_run_slab_thread = 1;
  396. static volatile int do_run_slab_rebalance_thread = 1;
  397. #define DEFAULT_SLAB_BULK_CHECK 1
  398. int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
  399. static int slab_rebalance_start(void) {
  400. slabclass_t *s_cls;
  401. int no_go = 0;
  402. pthread_mutex_lock(&cache_lock);
  403. pthread_mutex_lock(&slabs_lock);
  404. if (slab_rebal.s_clsid < POWER_SMALLEST ||
  405. slab_rebal.s_clsid > power_largest  ||
  406. slab_rebal.d_clsid < POWER_SMALLEST ||
  407. slab_rebal.d_clsid > power_largest  ||
  408. slab_rebal.s_clsid == slab_rebal.d_clsid)
  409. no_go = -2;
  410. s_cls = &slabclass[slab_rebal.s_clsid];
  411. if (!grow_slab_list(slab_rebal.d_clsid)) {
  412. no_go = -1;
  413. }
  414. if (s_cls->slabs < 2)
  415. no_go = -3;
  416. if (no_go != 0) {
  417. pthread_mutex_unlock(&slabs_lock);
  418. pthread_mutex_unlock(&cache_lock);
  419. return no_go; /* Should use a wrapper function... */
  420. }
  421. s_cls->killing = 1;
  422. slab_rebal.slab_start = s_cls->slab_list[s_cls->killing - 1];
  423. slab_rebal.slab_end   = (char *)slab_rebal.slab_start +
  424. (s_cls->size * s_cls->perslab);
  425. slab_rebal.slab_pos   = slab_rebal.slab_start;
  426. slab_rebal.done       = 0;
  427. /* Also tells do_item_get to search for items in this slab */
  428. slab_rebalance_signal = 2;
  429. if (settings.verbose > 1) {
  430. fprintf(stderr, "Started a slab rebalance\n");
  431. }
  432. pthread_mutex_unlock(&slabs_lock);
  433. pthread_mutex_unlock(&cache_lock);
  434. STATS_LOCK();
  435. stats.slab_reassign_running = true;
  436. STATS_UNLOCK();
  437. return 0;
  438. }
  439. enum move_status {
  440. MOVE_PASS=0, MOVE_DONE, MOVE_BUSY, MOVE_LOCKED
  441. };
  442. static int slab_rebalance_move(void) {
  443. slabclass_t *s_cls;
  444. int x;
  445. int was_busy = 0;
  446. int refcount = 0;
  447. enum move_status status = MOVE_PASS;
  448. pthread_mutex_lock(&cache_lock);
  449. pthread_mutex_lock(&slabs_lock);
  450. s_cls = &slabclass[slab_rebal.s_clsid];
  451. for (x = 0; x < slab_bulk_check; x++) {
  452. item *it = slab_rebal.slab_pos;
  453. status = MOVE_PASS;
  454. if (it->slabs_clsid != 255) {
  455. void *hold_lock = NULL;
  456. uint32_t hv = hash(ITEM_key(it), it->nkey);
  457. if ((hold_lock = item_trylock(hv)) == NULL) {
  458. status = MOVE_LOCKED;
  459. } else {
  460. refcount = refcount_incr(&it->refcount);
  461. if (refcount == 1) { /* item is unlinked, unused */
  462. if (it->it_flags & ITEM_SLABBED) {
  463. /* remove from slab freelist */
  464. if (s_cls->slots == it) {
  465. s_cls->slots = it->next;
  466. }
  467. if (it->next) it->next->prev = it->prev;
  468. if (it->prev) it->prev->next = it->next;
  469. s_cls->sl_curr--;
  470. status = MOVE_DONE;
  471. } else {
  472. status = MOVE_BUSY;
  473. }
  474. } else if (refcount == 2) { /* item is linked but not busy */
  475. if ((it->it_flags & ITEM_LINKED) != 0) {
  476. do_item_unlink_nolock(it, hv);
  477. status = MOVE_DONE;
  478. } else {
  479. /* refcount == 1 + !ITEM_LINKED means the item is being
  480. * uploaded to, or was just unlinked but hasn't been freed
  481. * yet. Let it bleed off on its own and try again later */
  482. status = MOVE_BUSY;
  483. }
  484. } else {
  485. if (settings.verbose > 2) {
  486. fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
  487. it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
  488. }
  489. status = MOVE_BUSY;
  490. }
  491. item_trylock_unlock(hold_lock);
  492. }
  493. }
  494. switch (status) {
  495. case MOVE_DONE:
  496. it->refcount = 0;
  497. it->it_flags = 0;
  498. it->slabs_clsid = 255;
  499. break;
  500. case MOVE_BUSY:
  501. refcount_decr(&it->refcount);
  502. case MOVE_LOCKED:
  503. slab_rebal.busy_items++;
  504. was_busy++;
  505. break;
  506. case MOVE_PASS:
  507. break;
  508. }
  509. slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
  510. if (slab_rebal.slab_pos >= slab_rebal.slab_end)
  511. break;
  512. }
  513. if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
  514. /* Some items were busy, start again from the top */
  515. if (slab_rebal.busy_items) {
  516. slab_rebal.slab_pos = slab_rebal.slab_start;
  517. slab_rebal.busy_items = 0;
  518. } else {
  519. slab_rebal.done++;
  520. }
  521. }
  522. pthread_mutex_unlock(&slabs_lock);
  523. pthread_mutex_unlock(&cache_lock);
  524. return was_busy;
  525. }
  526. static void slab_rebalance_finish(void) {
  527. slabclass_t *s_cls;
  528. slabclass_t *d_cls;
  529. pthread_mutex_lock(&cache_lock);
  530. pthread_mutex_lock(&slabs_lock);
  531. s_cls = &slabclass[slab_rebal.s_clsid];
  532. d_cls   = &slabclass[slab_rebal.d_clsid];
  533. /* At this point the stolen slab is completely clear */
  534. s_cls->slab_list[s_cls->killing - 1] =
  535. s_cls->slab_list[s_cls->slabs - 1];
  536. s_cls->slabs--;
  537. s_cls->killing = 0;
  538. memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);
  539. d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
  540. split_slab_page_into_freelist(slab_rebal.slab_start,
  541. slab_rebal.d_clsid);
  542. slab_rebal.done       = 0;
  543. slab_rebal.s_clsid    = 0;
  544. slab_rebal.d_clsid    = 0;
  545. slab_rebal.slab_start = NULL;
  546. slab_rebal.slab_end   = NULL;
  547. slab_rebal.slab_pos   = NULL;
  548. slab_rebalance_signal = 0;
  549. pthread_mutex_unlock(&slabs_lock);
  550. pthread_mutex_unlock(&cache_lock);
  551. STATS_LOCK();
  552. stats.slab_reassign_running = false;
  553. stats.slabs_moved++;
  554. STATS_UNLOCK();
  555. if (settings.verbose > 1) {
  556. fprintf(stderr, "finished a slab move\n");
  557. }
  558. }
  559. /*
  560. slab自动重分配时,执行此函数做出重分配方案决定
  561. */
  562. static int slab_automove_decision(int *src, int *dst) {
  563. static uint64_t evicted_old[POWER_LARGEST];
  564. static unsigned int slab_zeroes[POWER_LARGEST];
  565. static unsigned int slab_winner = 0;
  566. static unsigned int slab_wins   = 0;
  567. uint64_t evicted_new[POWER_LARGEST];
  568. uint64_t evicted_diff = 0;
  569. uint64_t evicted_max  = 0;
  570. unsigned int highest_slab = 0;
  571. unsigned int total_pages[POWER_LARGEST];
  572. int i;
  573. int source = 0;
  574. int dest = 0;
  575. static rel_time_t next_run;
  576. /* Run less frequently than the slabmove tester. */
  577. if (current_time >= next_run) {
  578. next_run = current_time + 10;
  579. } else {
  580. return 0;
  581. }
  582. item_stats_evictions(evicted_new);
  583. pthread_mutex_lock(&cache_lock);
  584. for (i = POWER_SMALLEST; i < power_largest; i++) {
  585. total_pages[i] = slabclass[i].slabs;
  586. }
  587. pthread_mutex_unlock(&cache_lock);
  588. /* Find a candidate source; something with zero evicts 3+ times */
  589. for (i = POWER_SMALLEST; i < power_largest; i++) {
  590. evicted_diff = evicted_new[i] - evicted_old[i];
  591. if (evicted_diff == 0 && total_pages[i] > 2) {
  592. slab_zeroes[i]++;
  593. if (source == 0 && slab_zeroes[i] >= 3)
  594. source = i;
  595. } else {
  596. slab_zeroes[i] = 0;
  597. if (evicted_diff > evicted_max) {
  598. evicted_max = evicted_diff;
  599. highest_slab = i;
  600. }
  601. }
  602. evicted_old[i] = evicted_new[i];
  603. }
  604. /* Pick a valid destination */
  605. if (slab_winner != 0 && slab_winner == highest_slab) {
  606. slab_wins++;
  607. if (slab_wins >= 3)
  608. dest = slab_winner;
  609. } else {
  610. slab_wins = 1;
  611. slab_winner = highest_slab;
  612. }
  613. if (source && dest) {
  614. *src = source;
  615. *dst = dest;
  616. return 1;
  617. }
  618. return 0;
  619. }
  620. /* Slab rebalancer thread.
  621. * Does not use spinlocks since it is not timing sensitive. Burn less CPU and
  622. * go to sleep if locks are contended
  623. 运行slab维护线程,slab维护线程的执行入口
  624. */
  625. static void *slab_maintenance_thread(void *arg) {
  626. int src, dest;
  627. while (do_run_slab_thread) {
  628. if (settings.slab_automove == 1) {
  629. if (slab_automove_decision(&src, &dest) == 1) {
  630. /* Blind to the return codes. It will retry on its own */
  631. slabs_reassign(src, dest); //移动slab,重分配
  632. }
  633. sleep(1);
  634. } else {
  635. /* Don't wake as often if we're not enabled.
  636. * This is lazier than setting up a condition right now. */
  637. sleep(5);
  638. }
  639. }
  640. return NULL;
  641. }
  642. /* Slab mover thread.
  643. * Sits waiting for a condition to jump off and shovel some memory about
  644. */
  645. static void *slab_rebalance_thread(void *arg) {
  646. int was_busy = 0;
  647. /* So we first pass into cond_wait with the mutex held */
  648. mutex_lock(&slabs_rebalance_lock);
  649. while (do_run_slab_rebalance_thread) {
  650. if (slab_rebalance_signal == 1) {
  651. if (slab_rebalance_start() < 0) {
  652. /* Handle errors with more specifity as required. */
  653. slab_rebalance_signal = 0;
  654. }
  655. was_busy = 0;
  656. } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
  657. was_busy = slab_rebalance_move();
  658. }
  659. if (slab_rebal.done) {
  660. slab_rebalance_finish();
  661. } else if (was_busy) {
  662. /* Stuck waiting for some items to unlock, so slow down a bit
  663. * to give them a chance to free up */
  664. usleep(50);
  665. }
  666. if (slab_rebalance_signal == 0) {
  667. /* always hold this lock while we're running */
  668. pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
  669. }
  670. }
  671. return NULL;
  672. }
  673. static int slabs_reassign_pick_any(int dst) {
  674. static int cur = POWER_SMALLEST - 1;
  675. int tries = power_largest - POWER_SMALLEST + 1;
  676. for (; tries > 0; tries--) {
  677. cur++;
  678. if (cur > power_largest)
  679. cur = POWER_SMALLEST;
  680. if (cur == dst)
  681. continue;
  682. if (slabclass[cur].slabs > 1) {
  683. return cur;
  684. }
  685. }
  686. return -1;
  687. }
  688. static enum reassign_result_type do_slabs_reassign(int src, int dst) {
  689. if (slab_rebalance_signal != 0)
  690. return REASSIGN_RUNNING;
  691. if (src == dst)
  692. return REASSIGN_SRC_DST_SAME;
  693. /* Special indicator to choose ourselves. */
  694. if (src == -1) {
  695. src = slabs_reassign_pick_any(dst);
  696. /* TODO: If we end up back at -1, return a new error type */
  697. }
  698. if (src < POWER_SMALLEST || src > power_largest ||
  699. dst < POWER_SMALLEST || dst > power_largest)
  700. return REASSIGN_BADCLASS;
  701. if (slabclass[src].slabs < 2)
  702. return REASSIGN_NOSPARE;
  703. slab_rebal.s_clsid = src;
  704. slab_rebal.d_clsid = dst;
  705. slab_rebalance_signal = 1;
  706. pthread_cond_signal(&slab_rebalance_cond);
  707. return REASSIGN_OK;
  708. }
  709. enum reassign_result_type slabs_reassign(int src, int dst) {
  710. enum reassign_result_type ret;
  711. if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
  712. return REASSIGN_RUNNING;
  713. }
  714. ret = do_slabs_reassign(src, dst);
  715. pthread_mutex_unlock(&slabs_rebalance_lock);
  716. return ret;
  717. }
  718. /* If we hold this lock, rebalancer can't wake up or move */
  719. void slabs_rebalancer_pause(void) {
  720. pthread_mutex_lock(&slabs_rebalance_lock);
  721. }
  722. void slabs_rebalancer_resume(void) {
  723. pthread_mutex_unlock(&slabs_rebalance_lock);
  724. }
  725. static pthread_t maintenance_tid;
  726. static pthread_t rebalance_tid;
  727. /**
  728. 启动slab维护线程
  729. */
  730. int start_slab_maintenance_thread(void) {
  731. int ret;
  732. slab_rebalance_signal = 0;
  733. slab_rebal.slab_start = NULL;
  734. char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
  735. if (env != NULL) {
  736. slab_bulk_check = atoi(env);
  737. if (slab_bulk_check == 0) {
  738. slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
  739. }
  740. }
  741. if (pthread_cond_init(&slab_rebalance_cond, NULL) != 0) {
  742. fprintf(stderr, "Can't intiialize rebalance condition\n");
  743. return -1;
  744. }
  745. pthread_mutex_init(&slabs_rebalance_lock, NULL);
  746. if ((ret = pthread_create(&maintenance_tid, NULL,
  747. slab_maintenance_thread, NULL)) != 0) {
  748. fprintf(stderr, "Can't create slab maint thread: %s\n", strerror(ret));
  749. return -1;
  750. }
  751. if ((ret = pthread_create(&rebalance_tid, NULL,
  752. slab_rebalance_thread, NULL)) != 0) {
  753. fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
  754. return -1;
  755. }
  756. return 0;
  757. }
  758. /**
  759. 停止slab维护线程,逻辑和停止哈希表维护线程一样。
  760. */
  761. void stop_slab_maintenance_thread(void) {
  762. mutex_lock(&cache_lock);
  763. do_run_slab_thread = 0;
  764. do_run_slab_rebalance_thread = 0;
  765. pthread_cond_signal(&maintenance_cond);
  766. pthread_mutex_unlock(&cache_lock);
  767. /* Wait for the maintenance thread to stop */
  768. pthread_join(maintenance_tid, NULL);
  769. pthread_join(rebalance_tid, NULL);
  770. }