5
5
#include < cuda.h>
6
6
#include < cuda_runtime.h>
7
7
8
- __device__ int _inv (float *m, float *invOut);
9
- __device__ void mult (float *A, float *B, float *C);
10
- __device__ void copy (float *A, float *B);
11
- __device__ void _eye (float *data);
8
+ __device__ int _inv (double *m, double *invOut);
9
+ __device__ void mult (double *A, double *B, double *C);
10
+ __device__ void copy (double *A, double *B);
11
+ __device__ void _eye (double *data);
12
12
13
13
14
14
@@ -24,25 +24,25 @@ __device__ void _eye(float *data);
24
24
* cdim: (int) number of joints
25
25
* out: (N, 6, cdim)
26
26
*/
27
- __global__ void _jacob0 (float *T,
28
- float *tool,
29
- float *e_tool,
30
- float *link_A,
27
+ __global__ void _jacob0 (double *T,
28
+ double *tool,
29
+ double *e_tool,
30
+ double *link_A,
31
31
int *link_axes,
32
32
int *link_isjoint,
33
33
int N,
34
34
int cdim,
35
- float *out)
35
+ double *out)
36
36
{
37
37
int tid = blockIdx .x * blockDim .x + threadIdx .x ;
38
- float *T_i, *tool_i;
39
- float *U, *temp, *etool_i;
40
- float *invU;
41
- float *link_iA;
42
-
43
- cudaMalloc ((void **)&U, sizeof (float ) * 16 );
44
- cudaMalloc ((void **)&invU, sizeof (float ) * 16 );
45
- cudaMalloc ((void **)&temp, sizeof (float ) * 16 );
38
+ double *T_i, *tool_i;
39
+ double *U, *temp, *etool_i;
40
+ double *invU;
41
+ double *link_iA;
42
+
43
+ cudaMalloc ((void **)&U, sizeof (double ) * 16 );
44
+ cudaMalloc ((void **)&invU, sizeof (double ) * 16 );
45
+ cudaMalloc ((void **)&temp, sizeof (double ) * 16 );
46
46
int j = 0 ;
47
47
48
48
T_i = &T[tid * 16 ];
@@ -65,7 +65,7 @@ __global__ void _jacob0(float *T,
65
65
_inv (U, invU);
66
66
mult (invU, T_i, temp);
67
67
68
- float *out_tid = &out[tid + 16 ];
68
+ double *out_tid = &out[tid + 16 ];
69
69
70
70
if (link_axes[i] == 0 ) {
71
71
out_tid[0 * tid + j] = U[0 * 4 + 2 ] * temp[1 * 4 + 3 ] - U[0 * 4 + 1 ] * temp[2 * 4 + 3 ];
@@ -134,7 +134,7 @@ __global__ void _jacob0(float *T,
134
134
}
135
135
136
136
137
- __device__ void _eye (float *data)
137
+ __device__ void _eye (double *data)
138
138
{
139
139
data[0 ] = 1 ;
140
140
data[1 ] = 0 ;
@@ -154,7 +154,7 @@ __device__ void _eye(float *data)
154
154
data[15 ] = 1 ;
155
155
}
156
156
157
- __device__ void copy (float *A, float *B)
157
+ __device__ void copy (double *A, double *B)
158
158
{
159
159
// copy A into B
160
160
B[0 ] = A[0 ];
@@ -175,7 +175,7 @@ __device__ void copy(float *A, float *B)
175
175
B[15 ] = A[15 ];
176
176
}
177
177
178
- __device__ void mult (float *A, float *B, float *C)
178
+ __device__ void mult (double *A, double *B, double *C)
179
179
{
180
180
const int N = 4 ;
181
181
int i, j, k;
@@ -195,10 +195,10 @@ __device__ void mult(float *A, float *B, float *C)
195
195
}
196
196
}
197
197
198
- __device__ int _inv (float *m, float *invOut)
198
+ __device__ int _inv (double *m, double *invOut)
199
199
{
200
- float *inv;
201
- cudaMalloc ((void **)&inv, sizeof (float ) * 16 );
200
+ double *inv;
201
+ cudaMalloc ((void **)&inv, sizeof (double ) * 16 );
202
202
double det;
203
203
int i;
204
204
@@ -329,6 +329,9 @@ __device__ int _inv(float *m, float *invOut)
329
329
}
330
330
331
331
332
+
333
+ extern " C" {
334
+
332
335
/*
333
336
* Params
334
337
* T: (N, 4, 4) the final transform matrix of all points (shared)
F438
@@ -340,42 +343,42 @@ __device__ int _inv(float *m, float *invOut)
340
343
* cdim: (int) number of joints
341
344
* out: (N, 6, cdim)
342
345
*/
343
- void jacob0 (float *T,
344
- float *tool,
345
- float *etool,
346
- float *link_A,
346
+ void jacob0 (double *T,
347
+ double *tool,
348
+ double *etool,
349
+ double *link_A,
347
350
int *link_axes,
348
351
int *link_isjoint,
349
352
int N,
350
353
int cdim,
351
- float *out)
354
+ double *out)
352
355
// affine_T[N]
353
356
// link_axes[cdim]
354
357
// link_A[cdim]
355
358
// link_isjoint[cdim]
356
359
// out
357
360
{
358
- float *d_T, *d_tool, *d_etool, *d_link_A;
361
+ double *d_T, *d_tool, *d_etool, *d_link_A;
359
362
int *d_link_axes, *d_link_isjoint;
360
- float *d_out;
363
+ double *d_out;
361
364
362
- cudaMalloc ((void **)&d_T, sizeof (float ) * N * 16 );
363
- cudaMalloc ((void **)&d_tool, sizeof (float ) * N * 16 );
364
- cudaMalloc ((void **)&d_etool, sizeof (float ) * N * 16 );
365
- cudaMalloc ((void **)&d_link_A, sizeof (float ) * cdim * 16 );
365
+ cudaMalloc ((void **)&d_T, sizeof (double ) * N * 16 );
366
+ cudaMalloc ((void **)&d_tool, sizeof (double ) * N * 16 );
367
+ cudaMalloc ((void **)&d_etool, sizeof (double ) * N * 16 );
368
+ cudaMalloc ((void **)&d_link_A, sizeof (double ) * cdim * 16 );
366
369
cudaMalloc ((void **)&d_link_axes, sizeof (int ) * cdim);
367
370
cudaMalloc ((void **)&d_link_isjoint, sizeof (int ) * cdim);
368
- cudaMalloc ((void **)&d_out, sizeof (float ) * 6 * cdim);
371
+ cudaMalloc ((void **)&d_out, sizeof (double ) * 6 * cdim);
369
372
370
373
371
374
// Transfer data from host to device memory
372
- cudaMemcpy (d_T, T, sizeof (float ) * N * 16 , cudaMemcpyHostToDevice);
373
- cudaMemcpy (d_tool, tool, sizeof (float ) * N * 16 , cudaMemcpyHostToDevice);
374
- cudaMemcpy (d_etool, etool, sizeof (float ) * N * 16 , cudaMemcpyHostToDevice);
375
- cudaMemcpy (d_link_A, link_A, sizeof (float ) * cdim * 16 , cudaMemcpyHostToDevice);
375
+ cudaMemcpy (d_T, T, sizeof (double ) * N * 16 , cudaMemcpyHostToDevice);
376
+ cudaMemcpy (d_tool, tool, sizeof (double ) * N * 16 , cudaMemcpyHostToDevice);
377
+ cudaMemcpy (d_etool, etool, sizeof (double ) * N * 16 , cudaMemcpyHostToDevice);
378
+ cudaMemcpy (d_link_A, link_A, sizeof (double ) * cdim * 16 , cudaMemcpyHostToDevice);
376
379
cudaMemcpy (d_link_axes, link_axes, sizeof (int ) * cdim, cudaMemcpyHostToDevice);
377
380
cudaMemcpy (d_link_isjoint, link_isjoint, sizeof (int ) * cdim, cudaMemcpyHostToDevice);
378
- cudaMemcpy (d_out, out, sizeof (float ) * 6 * cdim, cudaMemcpyHostToDevice);
381
+ cudaMemcpy (d_out, out, sizeof (double ) * 6 * cdim, cudaMemcpyHostToDevice);
379
382
380
383
381
384
int block_size = 256 ;
@@ -390,7 +393,10 @@ void jacob0(float *T,
390
393
cdim,
391
394
d_out);
392
395
393
- cudaMemcpy (out, d_out, sizeof (float ) * 6 * cdim, cudaMemcpyDeviceToHost);
396
+ // memset(out, 1, N * 6 * cdim);
397
+ // out[0] = 1;
398
+ cudaMemcpy (out, d_out, sizeof (double ) * 6 * cdim, cudaMemcpyDeviceToHost);
399
+ printf (" Out size %d %d %f %f %f %f %f" , N, cdim, out[0 ], out[1 ], out[2 ], out[3 ], out[4 ]);
394
400
395
401
// Deallocate device memory
396
402
cudaFree (d_T);
@@ -400,4 +406,7 @@ void jacob0(float *T,
400
406
cudaFree (d_link_axes);
401
407
cudaFree (d_link_isjoint);
402
408
cudaFree (d_out);
403
- }
409
+ }
410
+
411
+
412
+ }// extern "C"
0 commit comments