Name: Maryam Asad
Reg. no.: COSC211101083
Course Instructor: Sir Ahsan Aslam
Due Date: 30th December, 2024 (14:00 hours)
1. Host to Device Code Conversions
Example 1: Basic Vector Addition
Host Code (CPU)
#include <iostream>

void vectorAdd(float* A, float* B, float* C, int N) {
    for (int i = 0; i < N; ++i) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    const int N = 1000;   // const so the array sizes below are compile-time constants
    float A[N], B[N], C[N];
    // Initialize A and B arrays
    vectorAdd(A, B, C, N);
    return 0;
}
Device Code (GPU)
#include <iostream>
#include <cuda_runtime.h>

__global__ void vectorAddKernel(float* A, float* B, float* C, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        C[idx] = A[idx] + B[idx];
    }
}

int main() {
    const int N = 1000;
    float *d_A, *d_B, *d_C;
    float A[N], B[N], C[N];
    // Allocate device memory
    cudaMalloc((void**)&d_A, N * sizeof(float));
    cudaMalloc((void**)&d_B, N * sizeof(float));
    cudaMalloc((void**)&d_C, N * sizeof(float));
    // Copy data to device
    cudaMemcpy(d_A, A, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N * sizeof(float), cudaMemcpyHostToDevice);
    // Launch the kernel: one thread per element, 256 threads per block
    vectorAddKernel<<<(N + 255) / 256, 256>>>(d_A, d_B, d_C, N);
    // Copy result back to host (this call also synchronizes with the kernel)
    cudaMemcpy(C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
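None of the CUDA runtime calls above check their return values. A minimal error-checking sketch (the CUDA_CHECK name is my own, not part of the assignment code) that could wrap each allocation, copy, and kernel launch:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Sketch of an error-checking helper: reports and exits on any failed runtime call.
#define CUDA_CHECK(call)                                                        \
    do {                                                                        \
        cudaError_t err = (call);                                               \
        if (err != cudaSuccess) {                                               \
            std::fprintf(stderr, "CUDA error %s at %s:%d\n",                    \
                         cudaGetErrorString(err), __FILE__, __LINE__);          \
            std::exit(EXIT_FAILURE);                                            \
        }                                                                       \
    } while (0)

// Example usage around the calls above:
//   CUDA_CHECK(cudaMalloc((void**)&d_A, N * sizeof(float)));
//   vectorAddKernel<<<(N + 255) / 256, 256>>>(d_A, d_B, d_C, N);
//   CUDA_CHECK(cudaGetLastError());        // catches launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // catches errors raised during execution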
Example 2: Array Initialization and Summation
Host Code (CPU)
#include <iostream>

int main() {
    const int N = 1000;
    int A[N];
    int sum = 0;
    // Initialize array A
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }
    // Calculate sum of A
    for (int i = 0; i < N; ++i) {
        sum += A[i];
    }
    std::cout << "Sum: " << sum << std::endl;
    return 0;
}
Device Code (GPU)
#include <iostream>
#include <cuda_runtime.h>

__global__ void sumKernel(int* A, int* sum, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        atomicAdd(sum, A[idx]);
    }
}

int main() {
    const int N = 1000;
    int *d_A, *d_sum;
    int A[N];
    int sum = 0;
    // Initialize array A
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }
    // Allocate device memory
    cudaMalloc((void**)&d_A, N * sizeof(int));
    cudaMalloc((void**)&d_sum, sizeof(int));
    // Initialize sum to 0 on device
    cudaMemcpy(d_sum, &sum, sizeof(int), cudaMemcpyHostToDevice);
    // Copy data to device
    cudaMemcpy(d_A, A, N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch kernel
    sumKernel<<<(N + 255) / 256, 256>>>(d_A, d_sum, N);
    // Copy result back to host
    cudaMemcpy(&sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Sum: " << sum << std::endl;
    // Free device memory
    cudaFree(d_A);
    cudaFree(d_sum);
    return 0;
}
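atomicAdd serializes every update to the single sum counter. A common alternative, sketched below under the assumption of 256-thread blocks, is a shared-memory tree reduction inside each block followed by one atomicAdd per block; the launch configuration stays the same as for sumKernel.

// Sketch of a reduced-contention summation kernel (assumes blockDim.x == 256,
// a power of two). Each block combines its elements in shared memory and issues
// a single atomicAdd instead of one per element.
__global__ void sumKernelReduced(const int* A, int* sum, int N) {
    __shared__ int partial[256];
    int tid = threadIdx.x;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    partial[tid] = (idx < N) ? A[idx] : 0;   // out-of-range threads contribute 0
    __syncthreads();

    // Tree reduction within the block
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            partial[tid] += partial[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        atomicAdd(sum, partial[0]);   // one atomic per block instead of per element
    }
}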
Example 3: Parallel Array Scaling
Host Code (CPU)
#include <iostream>

void scaleArray(int* A, int scale, int N) {
    for (int i = 0; i < N; ++i) {
        A[i] *= scale;
    }
}

int main() {
    const int N = 1000;
    int A[N];
    // Initialize array A
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }
    int scale = 2;
    scaleArray(A, scale, N);
    return 0;
}
Device Code (GPU)
#include <iostream>
#include <cuda_runtime.h>

__global__ void scaleArrayKernel(int* A, int scale, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        A[idx] *= scale;
    }
}

int main() {
    const int N = 1000;
    int *d_A;
    int A[N];
    // Initialize array A
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }
    int scale = 2;
    // Allocate device memory
    cudaMalloc((void**)&d_A, N * sizeof(int));
    // Copy data to device
    cudaMemcpy(d_A, A, N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch kernel
    scaleArrayKernel<<<(N + 255) / 256, 256>>>(d_A, scale, N);
    // Copy result back to host
    cudaMemcpy(A, d_A, N * sizeof(int), cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A);
    return 0;
}
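This example copies the array to the device and back just to scale it in place. On devices that support unified memory, cudaMallocManaged lets the host and device share one allocation; a minimal sketch of the same program under that assumption:

// Sketch using unified (managed) memory, assuming the device supports it.
int* A;
cudaMallocManaged(&A, N * sizeof(int));          // visible to both host and device
for (int i = 0; i < N; ++i) A[i] = i;            // initialize on the host
scaleArrayKernel<<<(N + 255) / 256, 256>>>(A, scale, N);
cudaDeviceSynchronize();                          // wait before reading A on the host again
cudaFree(A);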
Example 4: Matrix Transposition
Host Code (CPU)
#include <iostream>

void transposeMatrix(int* A, int* B, int N) {
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            B[j * N + i] = A[i * N + j];
        }
    }
}

int main() {
    const int N = 3;
    int A[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    int B[N * N];
    transposeMatrix(A, B, N);
    return 0;
}
Device Code (GPU)
#include <iostream>
#include <cuda_runtime.h>

__global__ void transposeMatrixKernel(int* A, int* B, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int idy = threadIdx.y + blockIdx.y * blockDim.y;
    if (idx < N && idy < N) {
        B[idy * N + idx] = A[idx * N + idy];
    }
}

int main() {
    const int N = 3;
    int *d_A, *d_B;
    int A[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    int B[N * N];
    // Allocate device memory
    cudaMalloc((void**)&d_A, N * N * sizeof(int));
    cudaMalloc((void**)&d_B, N * N * sizeof(int));
    // Copy data to device
    cudaMemcpy(d_A, A, N * N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch kernel: one thread per element (single-thread blocks keep the 3x3 example simple)
    dim3 block(1, 1);
    dim3 grid(N, N);
    transposeMatrixKernel<<<grid, block>>>(d_A, d_B, N);
    // Copy result back to host
    cudaMemcpy(B, d_B, N * N * sizeof(int), cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    return 0;
}
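The launch above uses one-thread blocks, which is fine for a 3x3 matrix but wasteful at scale. A sketch of a 2D launch configuration for larger N (TILE = 16 is an arbitrary choice of block edge), reusing the same kernel:

// Launch-configuration sketch for larger matrices: each block covers a 16x16 tile.
const int TILE = 16;
dim3 block(TILE, TILE);
dim3 grid((N + TILE - 1) / TILE, (N + TILE - 1) / TILE);   // ceiling division per dimension
transposeMatrixKernel<<<grid, block>>>(d_A, d_B, N);        // bounds check handles partial tiles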
Example 5: Finding the Maximum Value
Host Code (CPU)
#include <iostream>

int findMax(int* A, int N) {
    int maxVal = A[0];
    for (int i = 1; i < N; ++i) {
        if (A[i] > maxVal) {
            maxVal = A[i];
        }
    }
    return maxVal;
}

int main() {
    const int N = 1000;
    int A[N];
    // Initialize array A
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }
    int maxVal = findMax(A, N);
    std::cout << "Max Value: " << maxVal << std::endl;
    return 0;
}
Device Code (GPU)
#include <iostream>
#include <cuda_runtime.h>

__global__ void findMaxKernel(int* A, int* maxVal, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        atomicMax(maxVal, A[idx]);
    }
}

int main() {
    const int N = 1000;
    int *d_A, *d_maxVal;
    int A[N];
    int maxVal = 0;   // 0 is a valid seed here because every element of A is non-negative
    // Initialize array A
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }
    // Allocate device memory
    cudaMalloc((void**)&d_A, N * sizeof(int));
    cudaMalloc((void**)&d_maxVal, sizeof(int));
    // Initialize maxVal to 0 on device
    cudaMemcpy(d_maxVal, &maxVal, sizeof(int), cudaMemcpyHostToDevice);
    // Copy data to device
    cudaMemcpy(d_A, A, N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch kernel
    findMaxKernel<<<(N + 255) / 256, 256>>>(d_A, d_maxVal, N);
    // Copy result back to host
    cudaMemcpy(&maxVal, d_maxVal, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Max Value: " << maxVal << std::endl;
    // Free device memory
    cudaFree(d_A);
    cudaFree(d_maxVal);
    return 0;
}
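Seeding the device result with 0 works here only because every element of A is non-negative. If the input could contain negative values, one possible fix is to seed with INT_MIN instead, sketched below:

#include <climits>   // for INT_MIN
int maxVal = INT_MIN;                // identity element for max over signed ints
cudaMemcpy(d_maxVal, &maxVal, sizeof(int), cudaMemcpyHostToDevice);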
2. Matrix Multiplication Examples in Device Code
Matrix Multiplication (Example 1)
Device Code (GPU)
#include <iostream>
#include <cuda_runtime.h>

__global__ void matrixMultiplyKernel(int* A, int* B, int* C, int N) {
    int row = threadIdx.x + blockIdx.x * blockDim.x;
    int col = threadIdx.y + blockIdx.y * blockDim.y;
    if (row < N && col < N) {
        int val = 0;
        for (int k = 0; k < N; ++k) {
            val += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = val;
    }
}

int main() {
    const int N = 3;
    int *d_A, *d_B, *d_C;
    int A[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    int B[N * N] = {9, 8, 7, 6, 5, 4, 3, 2, 1};
    int C[N * N] = {0};
    // Allocate device memory
    cudaMalloc((void**)&d_A, N * N * sizeof(int));
    cudaMalloc((void**)&d_B, N * N * sizeof(int));
    cudaMalloc((void**)&d_C, N * N * sizeof(int));
    // Copy data to device
    cudaMemcpy(d_A, A, N * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N * N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch kernel: one thread per output element (single-thread blocks keep the 3x3 example simple)
    dim3 block(1, 1);
    dim3 grid(N, N);
    matrixMultiplyKernel<<<grid, block>>>(d_A, d_B, d_C, N);
    // Copy result back to host
    cudaMemcpy(C, d_C, N * N * sizeof(int), cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
Matrix Multiplication (Example 2)
Device Code (GPU)
#include <iostream>
#include <cuda_runtime.h>

__global__ void matrixMultiplyKernel(int* A, int* B, int* C, int N) {
    int row = threadIdx.x + blockIdx.x * blockDim.x;
    int col = threadIdx.y + blockIdx.y * blockDim.y;
    if (row < N && col < N) {
        int value = 0;
        for (int i = 0; i < N; i++) {
            value += A[row * N + i] * B[i * N + col];
        }
        C[row * N + col] = value;
    }
}

int main() {
    const int N = 4;
    int *d_A, *d_B, *d_C;
    int A[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    int B[N * N] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    int C[N * N] = {0};
    // Allocate device memory
    cudaMalloc((void**)&d_A, N * N * sizeof(int));
    cudaMalloc((void**)&d_B, N * N * sizeof(int));
    cudaMalloc((void**)&d_C, N * N * sizeof(int));
    // Copy data to device
    cudaMemcpy(d_A, A, N * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N * N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch kernel: 2x2 thread blocks; N is divisible by 2, so the grid exactly covers the matrix
    dim3 block(2, 2);
    dim3 grid(N / 2, N / 2);
    matrixMultiplyKernel<<<grid, block>>>(d_A, d_B, d_C, N);
    // Copy result back to host
    cudaMemcpy(C, d_C, N * N * sizeof(int), cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
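Both kernels above re-read every element of A and B from global memory for each output element. A sketch of a shared-memory (tiled) variant, assuming TILE divides N (true for the 4x4 example with TILE = 2): each block stages one tile of A and one tile of B in shared memory before accumulating, which reduces global-memory traffic.

#define TILE 2

// Tiled matrix multiplication sketch; same row/col convention as the kernel above.
__global__ void matrixMultiplyTiled(const int* A, const int* B, int* C, int N) {
    __shared__ int tileA[TILE][TILE];
    __shared__ int tileB[TILE][TILE];

    int row = threadIdx.x + blockIdx.x * TILE;
    int col = threadIdx.y + blockIdx.y * TILE;
    int value = 0;

    for (int t = 0; t < N / TILE; ++t) {
        // Each thread loads one element of the current A tile and B tile
        tileA[threadIdx.x][threadIdx.y] = A[row * N + (t * TILE + threadIdx.y)];
        tileB[threadIdx.x][threadIdx.y] = B[(t * TILE + threadIdx.x) * N + col];
        __syncthreads();

        // Accumulate the partial dot product for this tile
        for (int k = 0; k < TILE; ++k) {
            value += tileA[threadIdx.x][k] * tileB[k][threadIdx.y];
        }
        __syncthreads();
    }
    C[row * N + col] = value;
}

// Launched exactly like the kernel above:
//   dim3 block(TILE, TILE);
//   dim3 grid(N / TILE, N / TILE);
//   matrixMultiplyTiled<<<grid, block>>>(d_A, d_B, d_C, N);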