

BECOA157 Parallel Matrix Multiplication

This document discusses parallel matrix multiplication using CUDA. It defines a GPU kernel called GPUmatmul that performs matrix multiplication by assigning each matrix element to a unique thread. The kernel is launched on the GPU with block and grid dimensions. The CPU implementation CPUmatmul is also provided for comparison. The main function initializes matrices, runs both CPU and GPU multiplication, times the runs, and checks the results. It shows the GPU implementation achieves significant speedup over the CPU version.


Parallel Matrix Multiplication using CUDA

#include <cstdio>
#include <ctime>
#include <iostream>
using namespace std;

// Kernel function for matrix multiplication


__global__
void GPUmatmul(int N, double *x, double *y, double *ans)
{
    // unique thread ID within the block (x varies fastest, then y, then z)
    int t = (blockDim.x*blockDim.y)*threadIdx.z + threadIdx.y*blockDim.x + threadIdx.x;
    // unique block ID within the grid
    int b = (gridDim.x*gridDim.y)*blockIdx.z + blockIdx.y*gridDim.x + blockIdx.x;
    // total number of threads per block
    int T = blockDim.x*blockDim.y*blockDim.z;
    // total number of blocks in the grid
    int B = gridDim.x*gridDim.y*gridDim.z;

    /*
     * Each cell of the result matrix is assigned to exactly one thread.
     * Each thread does O(N * number of assigned cells) work, and the sets
     * of cells assigned to different threads do not overlap, so no
     * synchronization is needed.
     */
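
    /*
     * Worked example with the launch configuration used in main below:
     * blockDim = (16,8,8) gives T = 1024; gridDim = (16,16,16) gives B = 4096.
     * The thread with threadIdx = (3,2,1) gets t = 16*8*1 + 2*16 + 3 = 163.
     * Since B = 4096 exceeds N = 512, each active block handles a single row
     * i = b (blocks with b >= N do nothing), and threads with t >= N idle.
     */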

    for (int i = b; i < N; i += B)
    {
        for (int j = t; j < N; j += T)
        {
            for (int k = 0; k < N; k++)
            {
                ans[i*N+j] += x[i*N+k]*y[k*N+j];
            }
        }
    }
}
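
// For comparison, a common alternative mapping (a sketch only; this kernel is
// not part of the assignment code and is never launched by main) assigns one
// thread per output cell using a 2D launch such as
// GPUmatmul2D<<<dim3((N+15)/16,(N+15)/16), dim3(16,16)>>>(N, x, y, ans);
__global__
void GPUmatmul2D(int N, double *x, double *y, double *ans)
{
    int i = blockIdx.y*blockDim.y + threadIdx.y; // row of the output cell
    int j = blockIdx.x*blockDim.x + threadIdx.x; // column of the output cell
    if (i < N && j < N)
    {
        for (int k = 0; k < N; k++)
        {
            ans[i*N+j] += x[i*N+k]*y[k*N+j];
        }
    }
}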

void CPUmatmul(int N, double *x, double *y, double *ans)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            for (int k = 0; k < N; k++)
            {
                ans[i*N+j] += x[i*N+k]*y[k*N+j];
            }
        }
    }
}

bool check(int N, double *ans)
{
    // x is filled with 5 and y is the identity, so each multiplication adds 5
    // to every cell of ans; ans is not reset between the 4 timed runs, so the
    // expected final value is 4 * 5 = 20 (exact in double, so == is safe)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            if (ans[i*N+j] != 20.0) return false;
        }
    }
    return true;
}

int main(void)
{
    // size of the (N x N) matrices; N = 512
    int N = 1<<9;

    double *x, *y, *ans;

    // Allocate unified memory, accessible from both CPU and GPU
    cudaMallocManaged(&x, N*N*sizeof(double));
    cudaMallocManaged(&y, N*N*sizeof(double));
    cudaMallocManaged(&ans, N*N*sizeof(double));
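
    // Note: for brevity, the cudaError_t values returned by the CUDA API
    // calls above (and by the kernel launch below) are not checked here;
    // production code would typically verify them, e.g. via cudaGetLastError().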

    // initialize x, y, and ans on the host:
    // x = all fives, y = the identity matrix, ans = all zeros
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            x[i*N+j] = 5;
            y[i*N+j] = (i == j ? 1 : 0);
            ans[i*N+j] = 0.0;
        }
    }

    clock_t t;
    double avg = 0;
    cout << "Starting CPU computation" << endl;
    // 4 runs; the first is a warm-up and is excluded from the average
    for (int i = 0; i <= 3; i++)
    {
        t = clock();
        CPUmatmul(N, x, y, ans);
        t = clock() - t;
        if (i) avg += t; // ignore the first run
        printf("It took CPU-%d %f ms.\n", i, ((double)t/CLOCKS_PER_SEC)*1000);
    }
    avg /= 3;
    avg /= CLOCKS_PER_SEC;
    avg *= 1000;
    printf("It took %lf ms on avg.\n", avg);
    if (check(N, ans)) cout << "RUN OK." << endl;
    else cout << "RUN NOT OK." << endl;

    // re-initialize x, y, and ans on the host before the GPU runs
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            x[i*N+j] = 5;
            y[i*N+j] = (i == j ? 1 : 0);
            ans[i*N+j] = 0.0;
        }
    }
    avg = 0;
    cout << "Starting GPU computation" << endl;
    // Run kernel on GPU: a 16x16x16 grid of 16x8x8 blocks
    // (4096 blocks of 1024 threads each)
    for (int i = 0; i <= 3; i++)
    {
        t = clock();
        GPUmatmul<<<dim3(16,16,16), dim3(16,8,8)>>>(N, x, y, ans);
        cudaDeviceSynchronize();
        t = clock() - t;
        if (i) avg += t; // ignore the first run
        printf("It took GPU-%d %f ms.\n", i, ((double)t/CLOCKS_PER_SEC)*1000);
    }
    avg /= 3;
    avg /= CLOCKS_PER_SEC;
    avg *= 1000;
    printf("It took %lf ms on avg.\n", avg);
    if (check(N, ans)) cout << "RUN OK." << endl;
    else cout << "RUN NOT OK." << endl;

    // Free unified memory
    cudaFree(x);
    cudaFree(y);
    cudaFree(ans);
    return 0;
}
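
A note on timing: clock() measures the host process's CPU time rather than wall-clock time, so the GPU figures above depend on how cudaDeviceSynchronize() waits. A common alternative, shown below as a minimal sketch that is not part of the original program, times the kernel on the device itself with the CUDA event API:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
GPUmatmul<<<dim3(16,16,16), dim3(16,8,8)>>>(N, x, y, ans);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("Kernel time: %f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);

The program can be compiled and run with nvcc, e.g. nvcc matmul.cu -o matmul && ./matmul (assuming the source is saved as matmul.cu).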
