1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
| | void CuStreamTest(int max_num)
{
int nstream = 2;
int n = max_num/nstream;
int size = n*sizeof(int);
int **dInData = (int**)malloc(nstream*sizeof(int*));
int **hInData = (int**)malloc(nstream*sizeof(int*));
for(int i = 0; i < nstream; ++i){
cutilSafeCall(cudaMallocHost((void**)&hInData[i], size));
cutilSafeCall(cudaMalloc((void**)&dInData[i], size));
for(int j = 0; j < n; ++j){
hInData[i][j] = i*n+j;
}
}
uint numThreads, numBlocks;
numThreads = min(64, n);
numBlocks = iDivUp(n, numThreads);
int smemSize = numThreads*sizeof(int);
int **hOutData = (int**)malloc(nstream*sizeof(int*));
int **dOutData = (int**)malloc(nstream*sizeof(int*));
for(int i = 0; i < nstream; ++i){
cutilSafeCall(cudaMallocHost((void**)&hOutData[i], numBlocks*sizeof(int)));
cutilSafeCall(cudaMalloc((void**)&dOutData[i], numBlocks*sizeof(int)));
}
cudaStream_t *streams = (cudaStream_t*)malloc(nstream*sizeof(cudaStream_t));
for(int i = 0; i < nstream; ++i){
cutilSafeCall(cudaStreamCreate(&streams[i]));
}
for(int i = 0; i < nstream; ++i){
cutilSafeCall(cudaMemcpyAsync(dInData[i], hInData[i], size, cudaMemcpyHostToDevice, streams[i]));
}
for(int i = 0; i < nstream; ++i){
reduce2<int><<< numBlocks, numThreads, smemSize, streams[i] >>>(dInData[i], dOutData[i], n);
}
for(int i = 0; i < nstream; ++i){
cutilSafeCall(cudaMemcpyAsync(hOutData[i], dOutData[i], numBlocks*sizeof(int), cudaMemcpyDeviceToHost, streams[i]));
}
cudaThreadSynchronize();
int sum_gpu = 0;
for(int i = 0; i < nstream; ++i){
for(int j = 0; j < numBlocks; ++j){
sum_gpu += hOutData[i][j];
}
}
int sum_cpu = 0;
for(int j = 0; j < n*nstream; ++j){
sum_cpu += j;
}
printf("gpu : %d\n", sum_gpu);
printf("cpu : %d\n", sum_cpu);
for(int i = 0; i < nstream; i++){
cudaStreamDestroy(streams[i]);
}
for(int i = 0; i < nstream; i++){
cudaFreeHost(hInData[i]);
cudaFreeHost(hOutData[i]);
cudaFree(dInData[i]);
cudaFree(dOutData[i]);
}
cudaThreadExit();
}
|