[go: up one dir, main page]

0% found this document useful (0 votes)
33 views9 pages

PDC Assignment

Uploaded by

maryamasad668
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
33 views9 pages

PDC Assignment

Uploaded by

maryamasad668
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

Name: Maryam Asad

Reg. no.: COSC211101083

Course Instructor: Sir Ahsan Aslam

Due Date: 30th December, 2024 (14:00 hours)


1. Host to Device Code Conversions

Example 1: Basic Vector Addition

Host Code (CPU)

#include <iostream>

// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, N).
// A and B are the input arrays, C receives the result; all hold N floats.
void vectorAdd(float* A, float* B, float* C, int N) {
    for (int idx = 0; idx != N; idx++) {
        float lhs = A[idx];
        float rhs = B[idx];
        C[idx] = lhs + rhs;
    }
}

// Driver: builds two input vectors and adds them with vectorAdd.
int main() {
    // N must be a compile-time constant: with the original `int N`, the
    // stack arrays below were non-standard variable-length arrays in C++.
    const int N = 1000;
    float A[N], B[N], C[N];

    // Actually initialize A and B — the original read them uninitialized,
    // which is undefined behavior.
    for (int i = 0; i < N; ++i) {
        A[i] = static_cast<float>(i);
        B[i] = static_cast<float>(2 * i);
    }

    vectorAdd(A, B, C, N);
    return 0;
}

Device Code (GPU)

#include <iostream>

// CUDA kernel: one thread per element, C[i] = A[i] + B[i].
// Launch with at least N threads total; surplus threads exit via the guard.
__global__ void vectorAddKernel(float* A, float* B, float* C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) {
        return;  // thread falls past the end of the arrays
    }
    C[i] = A[i] + B[i];
}

// Driver: vector addition on the GPU.
int main() {
    // Compile-time constant so the host arrays are standard C++
    // (the original `int N` made them non-standard VLAs).
    const int N = 1000;
    const size_t bytes = N * sizeof(float);
    float *d_A, *d_B, *d_C;
    float A[N], B[N], C[N];

    // Initialize the host inputs — the original copied them to the
    // device uninitialized, which is undefined behavior.
    for (int i = 0; i < N; ++i) {
        A[i] = static_cast<float>(i);
        B[i] = 2.0f * static_cast<float>(i);
    }

    // Allocate device memory
    cudaMalloc((void**)&d_A, bytes);
    cudaMalloc((void**)&d_B, bytes);
    cudaMalloc((void**)&d_C, bytes);

    // Copy data to device
    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);

    // Launch the kernel: 256 threads per block, ceil(N / 256) blocks.
    vectorAddKernel<<<(N + 255) / 256, 256>>>(d_A, d_B, d_C, N);

    // A kernel launch reports bad-configuration errors only through
    // cudaGetLastError(); the original never checked it.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Blocking copy back to host; this also synchronizes with the kernel.
    cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}

Example 2: Array Initialization and Summation

Host Code (CPU)

#include <iostream>

// Fills an array with 0..N-1 and prints the total of its elements.
int main() {
    const int N = 1000;
    int A[N];
    int sum = 0;

    // A[i] = i; the running total is accumulated in the same pass,
    // which is equivalent to filling first and summing afterwards.
    for (int i = 0; i < N; ++i) {
        A[i] = i;
        sum += A[i];
    }

    std::cout << "Sum: " << sum << std::endl;
    return 0;
}

Device Code (GPU)

#include <iostream>

// CUDA kernel: folds every element of A into *sum with atomicAdd.
// *sum must be zero-initialized before launch; one thread handles one element.
__global__ void sumKernel(int* A, int* sum, int N) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= N) {
        return;  // no element assigned to this thread
    }
    atomicAdd(sum, A[tid]);
}

// Driver: array summation on the GPU via atomicAdd.
int main() {
    const int N = 1000;
    int *d_A, *d_sum;
    int A[N];
    int sum = 0;

    // Initialize array A with 0..N-1.
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }

    // Allocate device memory
    cudaMalloc((void**)&d_A, N * sizeof(int));
    cudaMalloc((void**)&d_sum, sizeof(int));

    // The kernel accumulates with atomicAdd, so *d_sum must start at 0.
    cudaMemcpy(d_sum, &sum, sizeof(int), cudaMemcpyHostToDevice);
    // Copy the input data to the device.
    cudaMemcpy(d_A, A, N * sizeof(int), cudaMemcpyHostToDevice);

    // Launch kernel: 256 threads per block, ceil(N / 256) blocks.
    sumKernel<<<(N + 255) / 256, 256>>>(d_A, d_sum, N);

    // Launch-configuration errors surface only via cudaGetLastError();
    // the original never checked the launch.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Blocking copy: waits for the kernel, then fetches the result.
    cudaMemcpy(&sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Sum: " << sum << std::endl;

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_sum);

    return 0;
}

Example 3: Parallel Array Scaling

Host Code (CPU)

#include <iostream>

// Multiplies every element of A (length N) by scale, in place.
void scaleArray(int* A, int scale, int N) {
    for (int* cur = A; cur != A + N; ++cur) {
        *cur *= scale;
    }
}

// Builds an array of 0..N-1 and doubles every element in place.
int main() {
    const int N = 1000;
    const int scale = 2;  // doubling factor
    int A[N];

    // Fill A with its own indices.
    for (int idx = 0; idx < N; idx++) {
        A[idx] = idx;
    }

    scaleArray(A, scale, N);

    return 0;
}

Device Code (GPU)

#include <iostream>

// CUDA kernel: in-place A[i] *= scale, one element per thread.
// Launch with at least N threads; the guard discards the grid tail.
__global__ void scaleArrayKernel(int* A, int scale, int N) {
    int element = blockIdx.x * blockDim.x + threadIdx.x;
    if (element >= N) {
        return;  // past the end of the array
    }
    A[element] = A[element] * scale;
}

// Driver: in-place array scaling on the GPU.
int main() {
    const int N = 1000;
    const size_t bytes = N * sizeof(int);
    int *d_A;
    int A[N];

    // Initialize array A with 0..N-1.
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }

    int scale = 2;

    // Allocate device memory
    cudaMalloc((void**)&d_A, bytes);

    // Copy data to device
    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);

    // Launch kernel: 256 threads per block, ceil(N / 256) blocks.
    scaleArrayKernel<<<(N + 255) / 256, 256>>>(d_A, scale, N);

    // Launch-configuration errors surface only via cudaGetLastError();
    // the original never checked the launch.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Blocking copy: waits for the kernel, then fetches the scaled array.
    cudaMemcpy(A, d_A, bytes, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A);

    return 0;
}

Example 4: Matrix Transposition

Host Code (CPU)

#include <iostream>

// Writes the transpose of the N x N row-major matrix A into B.
// B[col][row] = A[row][col] for every (row, col) pair.
void transposeMatrix(int* A, int* B, int N) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            B[col * N + row] = A[row * N + col];
        }
    }
}

// Transposes a fixed 3x3 matrix on the CPU.
int main() {
    const int N = 3;
    int M[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9};  // input, row-major
    int T[N * N];                                // receives the transpose

    transposeMatrix(M, T, N);

    return 0;
}

Device Code (GPU)

#include <iostream>

// CUDA kernel: B = transpose(A) for an N x N row-major matrix.
// Expects a 2D launch whose x and y extents together cover N x N threads;
// each in-range thread moves exactly one element.
__global__ void transposeMatrixKernel(int* A, int* B, int N) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= N || y >= N) {
        return;  // outside the matrix
    }
    B[y * N + x] = A[x * N + y];
}

// Driver: matrix transposition on the GPU.
int main() {
    const int N = 3;
    const size_t bytes = N * N * sizeof(int);
    int *d_A, *d_B;
    int A[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    int B[N * N];

    // Allocate device memory
    cudaMalloc((void**)&d_A, bytes);
    cudaMalloc((void**)&d_B, bytes);

    // Copy data to device
    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);

    // 16x16 threads per block instead of the original block(1,1)/grid(N,N)
    // layout, which ran one thread per block and wasted the hardware.
    // The grid is the ceiling of N / block size in each axis, so the
    // same code works for any N.
    dim3 block(16, 16);
    dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);
    transposeMatrixKernel<<<grid, block>>>(d_A, d_B, N);

    // Launch-configuration errors surface only via cudaGetLastError();
    // the original never checked the launch.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Blocking copy: waits for the kernel, then fetches the transpose.
    cudaMemcpy(B, d_B, bytes, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);

    return 0;
}

Example 5: Finding the Maximum Value

Host Code (CPU)

#include <iostream>

// Returns the largest element of A.
// Precondition: A holds at least one element (N >= 1).
int findMax(int* A, int N) {
    int best = A[0];
    for (int i = 1; i < N; ++i) {
        best = (A[i] > best) ? A[i] : best;
    }
    return best;
}

// Fills an array with 0..N-1 and reports its maximum via findMax.
int main() {
    const int N = 1000;
    int A[N];

    for (int i = 0; i < N; i++) {
        A[i] = i;  // ascending values, so the max is N - 1
    }

    int maxVal = findMax(A, N);
    std::cout << "Max Value: " << maxVal << std::endl;

    return 0;
}

Device Code (GPU)

#include <iostream>

// CUDA kernel: folds each element of A into *maxVal with atomicMax.
// *maxVal must be pre-seeded with a value <= the true maximum
// (e.g. the first element) before launch.
__global__ void findMaxKernel(int* A, int* maxVal, int N) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= N) {
        return;  // no element for this thread
    }
    atomicMax(maxVal, A[tid]);
}

// Driver: maximum of an array on the GPU via atomicMax.
int main() {
    const int N = 1000;
    int *d_A, *d_maxVal;
    int A[N];

    // Initialize array A with 0..N-1.
    for (int i = 0; i < N; ++i) {
        A[i] = i;
    }

    // Seed the running maximum with a real element of A instead of 0:
    // a 0 seed silently returns the wrong answer whenever every element
    // is negative.
    int maxVal = A[0];

    // Allocate device memory
    cudaMalloc((void**)&d_A, N * sizeof(int));
    cudaMalloc((void**)&d_maxVal, sizeof(int));

    // Upload the seed and the input data.
    cudaMemcpy(d_maxVal, &maxVal, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_A, A, N * sizeof(int), cudaMemcpyHostToDevice);

    // Launch kernel: 256 threads per block, ceil(N / 256) blocks.
    findMaxKernel<<<(N + 255) / 256, 256>>>(d_A, d_maxVal, N);

    // Launch-configuration errors surface only via cudaGetLastError();
    // the original never checked the launch.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Blocking copy: waits for the kernel, then fetches the result.
    cudaMemcpy(&maxVal, d_maxVal, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Max Value: " << maxVal << std::endl;

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_maxVal);

    return 0;
}

2. Matrix Multiplication Examples in Device Code


Matrix Multiplication (Example 1)

Device Code (GPU)

#include <iostream>

// CUDA kernel: C = A * B for N x N row-major matrices, one output element
// per thread. Expects a 2D launch covering at least N x N threads.
//
// The column index now comes from the x dimension so that consecutive
// threads in a warp read/write consecutive addresses of B and C (coalesced
// global-memory access); the original derived row from x, giving strided,
// uncoalesced accesses. The set of (row, col) pairs covered is unchanged
// for the square launches used, so the output is identical.
__global__ void matrixMultiplyKernel(int* A, int* B, int* C, int N) {
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (row < N && col < N) {
        int val = 0;
        for (int k = 0; k < N; ++k) {
            val += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = val;
    }
}

// Driver: 3x3 integer matrix multiplication on the GPU.
int main() {
    const int N = 3;
    const size_t bytes = N * N * sizeof(int);
    int *d_A, *d_B, *d_C;
    int A[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    int B[N * N] = {9, 8, 7, 6, 5, 4, 3, 2, 1};
    int C[N * N] = {0};

    // Allocate device memory
    cudaMalloc((void**)&d_A, bytes);
    cudaMalloc((void**)&d_B, bytes);
    cudaMalloc((void**)&d_C, bytes);

    // Copy data to device
    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);

    // 16x16 threads per block instead of the original block(1,1)/grid(N,N),
    // which ran one thread per block; the grid is the ceiling of
    // N / block size in each axis, so any N is covered.
    dim3 block(16, 16);
    dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);
    matrixMultiplyKernel<<<grid, block>>>(d_A, d_B, d_C, N);

    // Launch-configuration errors surface only via cudaGetLastError();
    // the original never checked the launch.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Blocking copy: waits for the kernel, then fetches the product.
    cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}

Matrix Multiplication (Example 2)

Device Code (GPU)

#include <iostream>

// CUDA kernel: C = A * B for N x N row-major matrices, one output element
// per thread. Expects a 2D launch covering at least N x N threads.
//
// The column index now comes from the x dimension so that consecutive
// threads in a warp read/write consecutive addresses of B and C (coalesced
// global-memory access); the original derived row from x, producing
// strided, uncoalesced accesses. The covered (row, col) set is unchanged
// for the square launches used, so the output is identical.
__global__ void matrixMultiplyKernel(int* A, int* B, int* C, int N) {
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (row < N && col < N) {
        int value = 0;
        for (int i = 0; i < N; i++) {
            value += A[row * N + i] * B[i * N + col];
        }
        C[row * N + col] = value;
    }
}

// Driver: 4x4 integer matrix multiplication on the GPU with 2x2 blocks.
int main() {
    const int N = 4;
    const size_t bytes = N * N * sizeof(int);
    int *d_A, *d_B, *d_C;
    int A[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    int B[N * N] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    int C[N * N] = {0};

    // Allocate device memory
    cudaMalloc((void**)&d_A, bytes);
    cudaMalloc((void**)&d_B, bytes);
    cudaMalloc((void**)&d_C, bytes);

    // Copy data to device
    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);

    // Ceil-division grid: the original grid(N / 2, N / 2) silently skipped
    // the last row/column of the output whenever N was odd.
    dim3 block(2, 2);
    dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);
    matrixMultiplyKernel<<<grid, block>>>(d_A, d_B, d_C, N);

    // Launch-configuration errors surface only via cudaGetLastError();
    // the original never checked the launch.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Blocking copy: waits for the kernel, then fetches the product.
    cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}

You might also like