1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include <cstdio>
#include <cutil_inline.h>
#include <cutil.h>
#include "rx_test_kernel.cu"
extern "C"
{
/*!
 * Select the CUDA device from the command line.
 *
 * When the arguments contain a "device" flag, device setup is delegated to
 * cutilDeviceInit. Otherwise the device returned by cutGetMaxGflopsDeviceId
 * is made current.
 *
 * @param[in] argc number of command-line arguments
 * @param[in] argv command-line argument strings
 */
void CuSetDevice(int argc, char **argv)
{
    if(cutCheckCmdLineFlag(argc, (const char**)argv, "device")){
        cutilDeviceInit(argc, argv);
    }
    else{
        // Wrapped in cutilSafeCall like the other runtime calls in this file,
        // so a failed device selection is not silently ignored.
        cutilSafeCall(cudaSetDevice(cutGetMaxGflopsDeviceId()));
    }
}
/*!
 * Make the CUDA device with the given ordinal current.
 *
 * An ordinal outside [0, device count) is replaced by the device returned
 * by cutGetMaxGflopsDeviceId.
 *
 * @param[in] id requested device ordinal
 */
void CuSetDeviceByID(int id)
{
    int device_count = 0;
    // Both runtime calls go through cutilSafeCall, matching the rest of the
    // file, instead of discarding their error codes.
    cutilSafeCall(cudaGetDeviceCount(&device_count));

    // Fall back for ordinals that do not name an existing device.
    if(id < 0 || id >= device_count){
        id = cutGetMaxGflopsDeviceId();
    }

    cutilSafeCall(cudaSetDevice(id));
}
/*!
 * Print the properties of every CUDA device to stdout, followed by the
 * ordinal returned by cutGetMaxGflopsDeviceId.
 *
 * The size_t fields of cudaDeviceProp are cast to unsigned long and printed
 * with %lu; passing them to %d mismatches the argument width on 64-bit hosts.
 */
void CuDeviceProp(void)
{
    int n;
    cutilSafeCall(cudaGetDeviceCount(&n));

    for(int i = 0; i < n; ++i){
        cudaDeviceProp dev;
        cutilSafeCall(cudaGetDeviceProperties(&dev, i));

        printf("device %d\n", i);
        printf(" device name : %s\n", dev.name);
        printf(" total global memory : %lu (MB)\n", (unsigned long)(dev.totalGlobalMem/1024/1024));
        printf(" shared memory / block : %lu (KB)\n", (unsigned long)(dev.sharedMemPerBlock/1024));
        printf(" register / block : %d\n", dev.regsPerBlock);
        printf(" warp size : %d\n", dev.warpSize);
        printf(" max pitch : %lu (B)\n", (unsigned long)dev.memPitch);
        printf(" max threads / block : %d\n", dev.maxThreadsPerBlock);
        printf(" max size of each dim. of block : (%d, %d, %d)\n", dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]);
        printf(" max size of each dim. of grid : (%d, %d, %d)\n", dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]);
        printf(" clock rate : %d (MHz)\n", dev.clockRate/1000);
        printf(" total constant memory : %lu (KB)\n", (unsigned long)(dev.totalConstMem/1024));
        printf(" compute capability : %d.%d\n", dev.major, dev.minor);
        printf(" alignment requirement for texture : %lu\n", (unsigned long)dev.textureAlignment);
        printf(" device overlap : %s\n", (dev.deviceOverlap ? "ok" : "not"));
        printf(" num. of multiprocessors : %d\n", dev.multiProcessorCount);
        printf(" kernel execution timeout : %s\n", (dev.kernelExecTimeoutEnabled ? "on" : "off"));
        printf(" integrated : %s\n", (dev.integrated ? "on" : "off"));
        printf(" host memory mapping : %s\n", (dev.canMapHostMemory ? "on" : "off"));

        printf(" compute mode : ");
        if(dev.computeMode == cudaComputeModeDefault) printf("default mode (multiple threads can use) \n");
        else if(dev.computeMode == cudaComputeModeExclusive) printf("exclusive mode (only one thread will be able to use)\n");
        else if(dev.computeMode == cudaComputeModeProhibited) printf("prohibited mode (no threads can use)\n");
        // Any other mode still terminates the line so the next entry is not glued to it.
        else printf("unknown (%d)\n", (int)dev.computeMode);
    }

    printf("Device with Maximum GFLOPS : %d\n", cutGetMaxGflopsDeviceId());
}
/*!
 * Divide a by b and round the result up to the next integer.
 *
 * @param[in] a dividend
 * @param[in] b divisor; must be non-zero (it is used in a division and a modulo)
 * @return a/b when b divides a exactly, otherwise a/b + 1
 */
uint Ceil(uint a, uint b)
{
    const uint quotient = a/b;
    const uint remainder = a%b;

    // An exact division needs no extra unit.
    if(remainder == 0){
        return quotient;
    }
    return quotient+1;
}
/*!
 * Run the calSquareRoot kernel over n floats on the device and copy the
 * result into a host buffer.
 *
 * A device buffer of n floats is allocated, the kernel is launched with
 * THREAD_NUM threads per block, the buffer is copied to hS and then freed.
 *
 * @param[out] hS host buffer receiving n floats
 * @param[in]  n  number of elements; 0 returns immediately without touching hS
 */
void CuSquareRoot(float *hS, unsigned long n)
{
    // An empty request would otherwise launch a zero-sized grid, which is an
    // invalid launch configuration.
    if(n == 0) return;

    float *dS = 0;
    cutilSafeCall(cudaMalloc((void**)&dS, n*sizeof(float)));

    const uint block = THREAD_NUM;

    // The block count is computed in unsigned long so that n is not narrowed
    // to uint before the division.
    const unsigned long total_blocks = n/block + ((n%block != 0) ? 1 : 0);

    dim3 grid(1, 1, 1);
    if(total_blocks > 65535){
        // Spread the blocks over two grid dimensions. The row count is rounded
        // UP: a truncating division would leave the trailing blocks, and thus
        // the tail of the output, uncomputed.
        // NOTE(review): assumes calSquareRoot linearises (blockIdx.x, blockIdx.y)
        // and bounds-checks against n - confirm in rx_test_kernel.cu.
        grid.x = 32768;
        grid.y = (uint)(total_blocks/32768 + ((total_blocks%32768 != 0) ? 1 : 0));
    }
    else{
        grid.x = (uint)total_blocks;
    }

    printf("grid : (%u, %u, %u), block : %u\n", grid.x, grid.y, grid.z, block);

    calSquareRoot<<< grid, block >>>(dS, n);
    cutilCheckMsg("calSquareRoot kernel execution failed");
    cutilSafeCall(cudaThreadSynchronize());

    cutilSafeCall(cudaMemcpy((void*)hS, (void*)dS, n*sizeof(float), cudaMemcpyDeviceToHost));

    if(dS) cutilSafeCall(cudaFree(dS));
}
}
|