《GPU高性能编程CUDA实战》附录二散列表

▶ 使用CPU和GPU分别实现散列表

● CPU方法

 #include <stdio.h>

 #include <time.h>

 #include "cuda_runtime.h"

 #include "D:\Code\CUDA\book\common\book.h"

 #define SIZE            (100*1024*1024)

 #define ELEMENTS        (SIZE / sizeof(unsigned int))

 #define HASH_ENTRIES    (1024)

 // One node of a bucket's singly linked chain.
 struct Entry {
     unsigned int key;   // key; hash(key, count) selects the bucket
     void *value;        // opaque payload stored next to the key
     Entry *next;        // following node in the same chain, NULL at the end
 };

 // Chained hash table whose nodes are all carved out of one preallocated pool.
 struct Table {
     size_t count;       // number of buckets
     Entry **entries;    // bucket heads: entries[i] is the first node of chain i
     Entry *pool;        // backing storage for every node
     Entry *firstFree;   // next unused node inside pool
 };

 // Maps a key to a bucket index in [0, count) by taking the remainder.
 // count must be non-zero.
 size_t hash(unsigned int key, size_t count)
 {
     const size_t bucket = key % count;
     return bucket;
 }

 // Prepares an empty table with `entries` buckets and storage for `elements` nodes.
 void initialize_table(Table &table, int entries, int elements)
 {
     // calloc zero-fills the bucket array, so every chain starts out empty.
     Entry **buckets = (Entry**)calloc(entries, sizeof(Entry*));
     Entry *storage = (Entry*)malloc(elements * sizeof(Entry));

     table.count = entries;
     table.entries = buckets;
     table.pool = storage;
     table.firstFree = storage;   // no node has been handed out yet
 }

 // Releases the node pool and the bucket array owned by `table`.
 void free_table(Table &table)
 {
     free(table.pool);
     free(table.entries);
 }

 // Inserts (key, value) into the table using the next unused node of the pool.
 void add_to_table(Table &table, unsigned int key, void *value)
 {
     const size_t bucket = hash(key, table.count);

     // Take the next free node and advance the cursor past it.
     Entry *node = table.firstFree;
     table.firstFree = node + 1;

     node->key = key;
     node->value = value;

     // Push at the head of the bucket's chain rather than appending at the tail.
     node->next = table.entries[bucket];
     table.entries[bucket] = node;
 }

 // Walks every bucket chain, checks that each node is stored in the bucket its
 // key hashes to, and checks that the total number of nodes equals ELEMENTS.
 // Mismatches and the final result are reported on stdout.
 void verify_table(const Table &table)
 {
     size_t count = 0;
     for (size_t i = 0; i < table.count; i++)
     {
         for (const Entry *current = table.entries[i]; current != NULL; current = current->next)
         {
             ++count;
             const size_t expected = hash(current->key, table.count);
             if (expected != i)
                 // size_t values are widened to unsigned long long so that the
                 // format specifiers match their arguments on every platform.
                 printf("\n\t%u hashed to %llu, but was located at %llu\n", current->key,
                     (unsigned long long)expected, (unsigned long long)i);
         }
     }

     if (count != ELEMENTS)
         printf("\n\t%llu elements found in hash table.  Should be %llu\n",
             (unsigned long long)count, (unsigned long long)ELEMENTS);
     else
         printf("\n\tAll %llu elements found in hash table.\n", (unsigned long long)count);
 }

 // CPU benchmark: inserts ELEMENTS keys into the hash table, prints the build
 // time, then verifies the table.
 int main(void)
 {
     // big_random_block comes from book.h; presumably it returns a heap buffer
     // of SIZE random bytes (it is released with free() below) - TODO confirm.
     unsigned int *buffer = (unsigned int*)big_random_block(SIZE);
     Table table;
     clock_t start, stop;

     initialize_table(table, HASH_ENTRIES, (int)ELEMENTS);

     // Only the insertions are timed.
     start = clock();
     for (size_t i = 0; i < ELEMENTS; i++)
         add_to_table(table, buffer[i], (void*)NULL);
     stop = clock();
     printf("\n\tBuilding the table:  %3.1f ms\n", (float)(stop - start) / (float)CLOCKS_PER_SEC * 1000.0f);

     verify_table(table);

     free_table(table);
     free(buffer);

     getchar();   // wait for input before exiting
     return 0;
 }

● GPU方法（用到了前面的原子锁）

 #include <stdio.h>

 #include <stdlib.h>

 #include <time.h>

 #include "cuda_runtime.h"

 #include "device_launch_parameters.h"

 #include "cuda.h"

 #include "D:\Code\CUDA\book\common\book.h"

 #define SIZE            (100*1024*1024)

 #define ELEMENTS        (SIZE / sizeof(unsigned int))

 #define HASH_ENTRIES    (1024)

 // Spin lock whose state is a single int in device memory:
 // 0 means unlocked, 1 means locked.
 struct Lock
 {
     int *mutex;   // device pointer to the lock word

     // Allocates the lock word on the device and sets it to 0 (unlocked).
     // A failure is reported on stderr; mutex stays NULL if the allocation failed.
     Lock(void) : mutex(NULL)
     {
         const int state = 0;
         cudaError_t err = cudaMalloc((void **)&mutex, sizeof(int));
         if (err == cudaSuccess)
             err = cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
         if (err != cudaSuccess)
             fprintf(stderr, "Lock: cannot create device mutex: %s\n", cudaGetErrorString(err));
     }

     // Releases the lock word. Byte-wise copies of a Lock share this word and
     // must not be used after the original has been destroyed.
     ~Lock(void)
     {
         cudaFree(mutex);
     }

     // Acquires the lock, spinning until it becomes available.
     __device__ void lock(void)
     {
         // atomicCAS returns the old value; 0 means this thread flipped 0 -> 1.
         while (atomicCAS(mutex, 0, 1) != 0);
         // Order the acquire before the memory accesses of the critical section.
         __threadfence();
     }

     // Releases the lock.
     __device__ void unlock(void)
     {
         // Make writes from the critical section visible before the lock is released.
         __threadfence();
         atomicExch(mutex, 0);
     }
 };

 // One node of a bucket's singly linked chain.
 struct Entry {
     unsigned int key;   // key; hash(key, count) selects the bucket
     void *value;        // opaque payload stored next to the key
     Entry *next;        // following node in the same chain, NULL at the end
 };

 // Chained hash table whose nodes all live in one preallocated pool.
 struct Table {
     size_t count;       // number of buckets
     Entry **entries;    // bucket heads: entries[i] is the first node of chain i
     Entry *pool;        // backing storage for every node
     Entry *firstFree;   // free-node cursor; the kernel in this program does not read it
 };

 // Bucket index in [0, count) for `key`; callable from host and device code.
 // count must be non-zero.
 __device__ __host__ size_t hash(unsigned int key, size_t count)
 {
     const size_t bucket = key % count;
     return bucket;
 }

 // Allocates the table in device memory: `entries` bucket heads, all cleared to
 // zero so that every chain starts empty, and a pool of `elements` nodes.
 // A CUDA failure is reported on stderr; pointers that were not allocated stay NULL.
 void initialize_table(Table &table, int entries, int elements)
 {
     table.count = entries;
     table.entries = NULL;
     table.pool = NULL;
     // The kernel addresses the pool by index and never reads firstFree; it is
     // cleared so the struct carries no indeterminate member when passed by value.
     table.firstFree = NULL;

     const size_t bucketBytes = entries * sizeof(Entry*);
     cudaError_t err = cudaMalloc((void**)&table.entries, bucketBytes);
     if (err == cudaSuccess)
         err = cudaMemset(table.entries, 0, bucketBytes);
     if (err == cudaSuccess)
         err = cudaMalloc((void**)&table.pool, elements * sizeof(Entry));
     if (err != cudaSuccess)
         fprintf(stderr, "initialize_table: %s\n", cudaGetErrorString(err));
 }

 // Releases the device-side node pool and bucket array of `table`.
 void free_table(Table &table)
 {
     cudaFree(table.pool);
     cudaFree(table.entries);
 }

 // Inserts keys[i] / values[i] for every i in [0, ELEMENTS) into the device table.
 // Element i always occupies pool node i. `lock` is an array with one Lock per
 // bucket; lock[b] guards the head pointer of bucket b.
 // Each thread starts at its global index and advances by the total number of
 // launched threads, so any 1-D launch configuration covers all elements.
 __global__ void add_to_table(unsigned int *keys, void **values, Table table, Lock *lock)
 {
     const int lanesPerWarp = 32;

     // size_t indices avoid a signed/unsigned comparison against ELEMENTS.
     size_t tid = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
     const size_t stride = (size_t)blockDim.x * gridDim.x;

     while (tid < ELEMENTS)
     {
         unsigned int key = keys[tid];
         size_t hashValue = hash(key, table.count);
         const int lane = (int)(tid % lanesPerWarp);

         // Spread the warp out: the 32 threads of one warp perform their
         // insertion on different iterations of this loop (presumably so that
         // lanes of one warp do not spin on the same bucket lock against each other).
         for (int turn = 0; turn < lanesPerWarp; turn++)
         {
             if (lane == turn)
             {
                 // The node is private to this thread, so it is filled in
                 // before the lock is taken.
                 Entry *location = &(table.pool[tid]);
                 location->key = key;
                 location->value = values[tid];

                 // Only the head-pointer update needs the bucket lock.
                 lock[hashValue].lock();
                 location->next = table.entries[hashValue];
                 table.entries[hashValue] = location;
                 lock[hashValue].unlock();
             }
         }
         tid += stride;
     }
 }

 // Builds in `hostTable` a host-memory copy of the device table `table`.
 // The bucket array and the pool of ELEMENTS nodes are copied, then every
 // non-NULL pointer is rebased from the device pool to the host pool.
 // The caller must free() hostTable.entries and hostTable.pool.
 // On failure a message goes to stderr and hostTable is left empty
 // (count 0, NULL pointers), so it can still be walked and freed safely.
 void copy_table_to_host(const Table &table, Table &hostTable)
 {
     hostTable.count = table.count;
     hostTable.entries = (Entry**)calloc(table.count, sizeof(Entry*));
     hostTable.pool = (Entry*)malloc(ELEMENTS * sizeof(Entry));
     hostTable.firstFree = NULL;

     bool ok = (hostTable.entries != NULL && hostTable.pool != NULL);
     if (!ok)
         fprintf(stderr, "copy_table_to_host: out of host memory\n");

     if (ok)
     {
         cudaError_t err = cudaMemcpy(hostTable.entries, table.entries, table.count * sizeof(Entry*), cudaMemcpyDeviceToHost);
         if (err == cudaSuccess)
             err = cudaMemcpy(hostTable.pool, table.pool, ELEMENTS * sizeof(Entry), cudaMemcpyDeviceToHost);
         if (err != cudaSuccess)
         {
             fprintf(stderr, "copy_table_to_host: %s\n", cudaGetErrorString(err));
             ok = false;
         }
     }

     if (!ok)
     {
         free(hostTable.entries);
         free(hostTable.pool);
         hostTable.entries = NULL;
         hostTable.pool = NULL;
         hostTable.count = 0;
         return;
     }

     // Linear offset from device addresses to host addresses:
     // hostAddress = deviceAddress - devicePoolBase + hostPoolBase
     const size_t devBase = (size_t)table.pool;
     const size_t hostBase = (size_t)hostTable.pool;

     // Rebase the head pointer of every non-empty bucket.
     for (size_t i = 0; i < table.count; i++)
     {
         if (hostTable.entries[i] != NULL)
             hostTable.entries[i] = (Entry*)((size_t)hostTable.entries[i] - devBase + hostBase);
     }

     // Same offset for each node's link to the following node.
     for (size_t i = 0; i < ELEMENTS; i++)
     {
         if (hostTable.pool[i].next != NULL)
             hostTable.pool[i].next = (Entry*)((size_t)hostTable.pool[i].next - devBase + hostBase);
     }
 }

 // Copies the device table to the host, then checks that every node is stored
 // in the bucket its key hashes to and that ELEMENTS nodes are reachable.
 // Mismatches and the final result are reported on stdout.
 void verify_table(const Table &dev_table)
 {
     Table   table;
     copy_table_to_host(dev_table, table);

     size_t count = 0;
     for (size_t i = 0; i < table.count; i++)
     {
         for (const Entry *current = table.entries[i]; current != NULL; current = current->next)
         {
             ++count;
             const size_t expected = hash(current->key, table.count);
             if (expected != i)
                 // size_t values are widened to unsigned long long so that the
                 // format specifiers match their arguments on every platform.
                 printf("%u hashed to %llu, but was located at %llu\n", current->key,
                     (unsigned long long)expected, (unsigned long long)i);
         }
     }

     if (count != ELEMENTS)
         printf("%llu elements found in hash table.  Should be %llu\n",
             (unsigned long long)count, (unsigned long long)ELEMENTS);
     else
         printf("All %llu elements found in hash table.\n", (unsigned long long)count);

     // The host copy was allocated by copy_table_to_host and is owned here.
     free(table.pool);
     free(table.entries);
 }

 // Prints a diagnostic and terminates the program when a CUDA runtime call fails.
 static void check_cuda(cudaError_t err, const char *what)
 {
     if (err != cudaSuccess)
     {
         fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
 }

 // GPU benchmark: inserts ELEMENTS keys into a device hash table, prints the
 // kernel time measured with CUDA events, then verifies the table on the host.
 int main(void)
 {
     // big_random_block comes from book.h; presumably it returns a heap buffer
     // of SIZE random bytes (it is released with free() below) - TODO confirm.
     unsigned int *buffer = (unsigned int*)big_random_block(SIZE);
     unsigned int *dev_keys;
     void         **dev_values;

     check_cuda(cudaMalloc((void**)&dev_keys, SIZE), "cudaMalloc(dev_keys)");
     // The kernel reads one void* per element, so the buffer is sized by
     // pointer width, and cleared because no values are ever uploaded.
     check_cuda(cudaMalloc((void**)&dev_values, ELEMENTS * sizeof(void*)), "cudaMalloc(dev_values)");
     check_cuda(cudaMemset(dev_values, 0, ELEMENTS * sizeof(void*)), "cudaMemset(dev_values)");
     check_cuda(cudaMemcpy(dev_keys, buffer, SIZE, cudaMemcpyHostToDevice), "cudaMemcpy(dev_keys)");

     Table table;
     initialize_table(table, HASH_ENTRIES, (int)ELEMENTS);

     // One lock per bucket. Each host Lock owns a device mutex; the array is
     // copied byte-wise so the device-side Locks hold the same mutex pointers.
     // The host array stays alive until main returns, i.e. after the kernel is done.
     Lock    lock[HASH_ENTRIES];
     Lock    *dev_lock;
     check_cuda(cudaMalloc((void**)&dev_lock, HASH_ENTRIES * sizeof(Lock)), "cudaMalloc(dev_lock)");
     check_cuda(cudaMemcpy(dev_lock, lock, HASH_ENTRIES * sizeof(Lock), cudaMemcpyHostToDevice), "cudaMemcpy(dev_lock)");

     cudaEvent_t     start, stop;
     check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
     check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

     check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
     // NOTE(review): the launch dimensions were missing from this listing;
     // 60 blocks of 256 threads is believed to match the book's original.
     // Any 1-D configuration is valid because the kernel strides over all elements.
     add_to_table<<<60, 256>>>(dev_keys, dev_values, table, dev_lock);
     check_cuda(cudaGetLastError(), "add_to_table launch");
     check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
     check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

     float   elapsedTime;
     check_cuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
     printf("Time to hash:  %3.1f ms\n", elapsedTime);

     verify_table(table);

     cudaEventDestroy(start);
     cudaEventDestroy(stop);
     free_table(table);   // released exactly once
     cudaFree(dev_lock);
     cudaFree(dev_keys);
     cudaFree(dev_values);
     free(buffer);

     getchar();   // wait for input before exiting
     return 0;
 }

《GPU高性能编程CUDA实战》附录二散列表的更多相关文章

[问题解决]《GPU高性能编程CUDA实战》中第4章Julia实例“显示器驱动已停止响应，并且已恢复”问题的解决方法
以下问题的出现及解决都基于"WIN7+CUDA7.5". 问题描述:当我编译运行<GPU高性能编程CUDA实战>中第4章所给Julia实例代码时,出现了显示器闪动的现象 ...
《GPU高性能编程CUDA实战》附录四其他头文件
▶ cpu_bitmap.h #ifndef __CPU_BITMAP_H__ #define __CPU_BITMAP_H__ #include "gl_helper.h" st ...
《GPU高性能编程CUDA实战》附录一高级原子操作
▶ 本章介绍了手动实现原子操作.重构了第五章向量点积的过程.核心是通过定义结构Lock及其运算,实现锁定,读写,解锁的过程. ● 章节代码 #include <stdio.h> #incl ...
《GPU高性能编程CUDA实战》附录三关于book.h
▶ 本书中用到的公用函数放到了头文件book.h中 #ifndef __BOOK_H__ #define __BOOK_H__ #include <stdio.h> #include &l ...
《GPU高性能编程CUDA实战》第五章线程并行
▶ 本章介绍了线程并行,并给出四个例子.长向量加法.波纹效果.点积和显示位图. ● 长向量加法(线程块并行 + 线程并行) #include <stdio.h> #include &quo ...
《GPU高性能编程CUDA实战》第十一章多GPU系统的CUDA C
▶ 本章介绍了多设备胸膛下的 CUDA 编程,以及一些特殊存储类型对计算速度的影响 ● 显存和零拷贝内存的拷贝与计算对比 #include <stdio.h> #include &quot ...
《GPU高性能编程CUDA实战》第七章纹理内存
▶ 本章介绍了纹理内存的使用,并给出了热传导的两个个例子.分别使用了一维和二维纹理单元. ● 热传导(使用一维纹理) #include <stdio.h> #include "c ...
《GPU高性能编程CUDA实战》第四章简单的线程块并行
▶ 本章介绍了线程块并行,并给出两个例子:长向量加法和绘制julia集. ● 长向量加法,中规中矩的GPU加法,包含申请内存和显存,赋值,显存传入,计算,显存传出,处理结果,清理内存和显存.用到了 t ...
《GPU高性能编程CUDA实战》第八章图形互操作性
▶ OpenGL与DirectX,等待填坑. ● basic_interop #include <stdio.h> #include "cuda_runtime.h" ...

随机推荐

SAP接口编程之 JCo3.0系列(04) : 会话管理
在SAP接口编程之 NCo3.0系列(06) : 会话管理这篇文章中,对会话管理的相关知识点已经说得很详细了,请参考.现在用JCo3.0来实现. 1. JCoContext 如果SAP中多个函数需要 ...
无状态服务（stateless service）
一.定义无状态服务(stateless service)对单次请求的处理,不依赖其他请求,也就是说,处理一次请求所需的全部信息,要么都包含在这个请求里,要么可以从外部获取到(比如说数据库),服务器本 ...
ls 显示目录和文件的技巧
摘自 http://hi.baidu.com/zaoyuan1217/blog/item/fd69575660366b4fd10906b9.html 要列出当前目录下所有的文件名和目录名直接使用ls命 ...
hdu 4714
一个树形dp的题,又是一个涉及不深的领域 = =: 不过在网上看到了大神用很巧的思路解决了这个题: 大神的思路就是: 从树的底部往上看:如果一棵子树拥有两个及以上的叶子节点,可以将这棵子树与大树分离 ...
（图文教程）帝国cms7.0列表页模板调用多说评论次数
多说是站长朋友们常用的一款社会化评论插件.这里为大家介绍一下帝国列表页调用多说评论次数的方法. 文章由谢寒执笔.博客地址:www.cnblogs.com/officexie/: 1.首先在内容页模板中 ...
xmlplus 组件设计系列之七 - 路由
在浏览器端,对路由的理解一般是根据不同的 URL 完成页面的切换.在服务器端,则是根据不同的 URL 请求回馈相关的页面.在本章,我们讲述的是根据接收到的不同命令,路由组件呈现出不同的页面,这算是广义 ...
PHP把2个二维数组合并一个二维数组
$a = array(0 => Array(id => 66,class_name => www.iiwnet.com),1 => Array(id => 67,clas ...
SPARK-18560
##### Receiver data can not be dataSerialized properly. ``` // :: ERROR executor.Executor: Exception ...
memcached命令行、Memcached数据导出和导入、php连接memcache、php的session存储到memcached
1.memcached命令行 telnet 127.0.0.1 11211set key2 0 30 2abSTOREDget key2VALUE key2 0 2abEND 如: set key3 ...
request.getServletPath()，request.getContextPath()
2018-11-24 16:34:33 1. getServletPath():获取能够与“url-pattern”中匹配的路径,注意是完全匹配的部分,*的部分不包括. 2. getPageInfo ...

《GPU高性能编程CUDA实战》附录二 散列表

《GPU高性能编程CUDA实战》附录二 散列表的更多相关文章

随机推荐

相关文章

《GPU高性能编程CUDA实战》附录二散列表

《GPU高性能编程CUDA实战》附录二散列表的更多相关文章