Matrix Multiplication on GPU: Optimizations and Insights
In this notebook, we explore GPU implementations of matrix
multiplication. We analyze various optimization techniques including:
- Naive implementation
- Shared memory tiling
- Register tiling (+ shared memory)
Each method aims to reduce global memory access and increase computation
throughput.
Installing the required packages and initialising the environment
pip install nvcc4jupyter
Collecting nvcc4jupyter
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
%load_ext nvcc4jupyter
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmprk8r5l8j".
from nvcc4jupyter import set_defaults
set_defaults(compiler_args="-arch=sm_75")
Setting the matrix size
MATRIX_SIZE = 8192
with open("matrix.size", "w") as size_file:
    size_file.write(str(MATRIX_SIZE))
Naive Implementation
Each thread computes one element of C, reading a full row of A and a full column of B directly from global memory, with no explicit reuse of loaded data.
%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>
#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line) {
if (code != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(code)
<< " " << file << ":" << line << std::endl;
exit(code);
}
}
__global__ void matMulKernel(float *A, float *B, float *C, int N) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < N && col < N) {
float sum = 0.0f;
for (int k = 0; k < N; ++k) {
sum += A[row * N + k] * B[k * N + col];
}
C[row * N + col] = sum;
}
}
void matMul(float *A, float *B, float *C, int N, float &ms) {
float *d_A, *d_B, *d_C;
size_t size = N * N * sizeof(float);
CUDA_CHECK(cudaMalloc(&d_A, size));
CUDA_CHECK(cudaMalloc(&d_B, size));
CUDA_CHECK(cudaMalloc(&d_C, size));
CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((N + 15) / 16, (N + 15) / 16);
cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
CUDA_CHECK(cudaEventRecord(start));
matMulKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUDA_CHECK(cudaFree(d_C));
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
}
int main(int argc, char **argv) {
std::ifstream inSize("matrix.size");
if (inSize.is_open() == false) {
std::cerr << "Unable to open input file!" << std::endl;
return 1;
}
int N;
inSize >> N;
const int size = N * N;
float *A = new float[size];
float *B = new float[size];
float *C = new float[size];
for (int i = 0; i < size; ++i) {
A[i] = static_cast<float>(rand()) / RAND_MAX;
B[i] = static_cast<float>(rand()) / RAND_MAX;
}
float ms = 0;
matMul(A, B, C, N, ms);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
}
std::cout << "Naive: N = " << N << ", kernel time = " << ms << " ms" <<
std::endl;
std::ofstream out("naive_time.time"); // No append flag = overwrite
if (out.is_open()) {
out << "Naive: N = " << N << ", kernel time = " << ms << " ms" <<
std::endl;
out.close();
} else {
std::cerr << "Unable to open output file!" << std::endl;
}
std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = "
<< C[(N/2) * N + (N/2)] << ", C[N-1][N-1] = " << C[(N-1)*N + (N-
1)] << std::endl;
delete[] A;
delete[] B;
delete[] C;
return 0;
}
Naive: N = 8192, kernel time = 2590.08 ms
Sample outputs: C[0][0] = 2052.07, C[N/2][N/2] = 2059.19, C[N-1][N-1] = 2070.76
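As a quick sanity check, the measured kernel time above can be converted into effective throughput using the standard 2·N^3 FLOP count for a dense NxN matrix multiplication. A small illustrative Python cell (the values are simply copied from the run above):

# Back-of-the-envelope throughput of the naive kernel (assumes 2*N^3 FLOPs)
N = 8192                # matches MATRIX_SIZE set earlier
kernel_ms = 2590.08     # kernel time reported by the naive run above
gflops = 2 * N**3 / (kernel_ms / 1e3) / 1e9
print(f"Naive kernel: ~{gflops:.0f} GFLOP/s")   # roughly 425 GFLOP/s for this run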
Shared Memory Tiling
This kernel uses shared memory to reduce redundant loads from global memory. Each block loads a tile of A and a tile of B into shared memory, synchronizes, accumulates the partial products from that tile, and synchronizes again before moving on to the next tile.
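To see why this helps, count global-memory loads: the naive kernel fetches one element of A and one of B per inner-loop step, so every input element is read N times in total, whereas with TILE_SIZE x TILE_SIZE tiles each element is read from global memory only once per tile pass, i.e. N / TILE_SIZE times. A rough illustrative count in Python (ignores caching effects):

# Approximate number of global-memory float loads for each kernel (no caching assumed)
N, TILE = 8192, 16
naive_loads = 2 * N**3          # one A and one B element per multiply-add
tiled_loads = 2 * N**3 // TILE  # each shared-memory tile element is reused TILE times
print(f"naive: {naive_loads:.2e} loads, tiled: {tiled_loads:.2e} loads "
      f"-> {naive_loads // tiled_loads}x fewer")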
%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>
#define TILE_SIZE 16
#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line) {
if (code != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(code)
<< " " << file << ":" << line << std::endl;
exit(code);
}
}
__global__ void matMulKernel(float *A, float *B, float *C, int N) {
__shared__ float tileA[TILE_SIZE][TILE_SIZE];
__shared__ float tileB[TILE_SIZE][TILE_SIZE];
int row = blockIdx.y * TILE_SIZE + threadIdx.y;
int col = blockIdx.x * TILE_SIZE + threadIdx.x;
float val = 0.0;
for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; ++t) {
if (row < N && t * TILE_SIZE + threadIdx.x < N)
tileA[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_SIZE + threadIdx.x];
else
tileA[threadIdx.y][threadIdx.x] = 0;
if (t * TILE_SIZE + threadIdx.y < N && col < N)
tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * N + col];
else
tileB[threadIdx.y][threadIdx.x] = 0;
__syncthreads();
for (int i = 0; i < TILE_SIZE; ++i)
val += tileA[threadIdx.y][i] * tileB[i][threadIdx.x];
__syncthreads();
}
if (row < N && col < N)
C[row * N + col] = val;
}
void matMul(float *A, float *B, float *C, int N, float &ms) {
float *d_A, *d_B, *d_C;
size_t size = N * N * sizeof(float);
CUDA_CHECK(cudaMalloc(&d_A, size));
CUDA_CHECK(cudaMalloc(&d_B, size));
CUDA_CHECK(cudaMalloc(&d_C, size));
CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((N + 15) / 16, (N + 15) / 16);
cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
CUDA_CHECK(cudaEventRecord(start));
matMulKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUDA_CHECK(cudaFree(d_C));
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
}
int main(int argc, char **argv) {
std::ifstream inSize("matrix.size");
if (inSize.is_open() == false) {
std::cerr << "Unable to open input file!" << std::endl;
return 1;
}
int N;
inSize >> N;
const int size = N * N;
float *A = new float[size];
float *B = new float[size];
float *C = new float[size];
for (int i = 0; i < size; ++i) {
A[i] = static_cast<float>(rand()) / RAND_MAX;
B[i] = static_cast<float>(rand()) / RAND_MAX;
}
float ms = 0;
matMul(A, B, C, N, ms);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
}
std::cout << "Shared: N = " << N << ", kernel time = " << ms << " ms" <<
std::endl;
std::ofstream out("shared_time.time"); // No append flag = overwrite
if (out.is_open()) {
out << "Shared: N = " << N << ", kernel time = " << ms << " ms" <<
std::endl;
out.close();
} else {
std::cerr << "Unable to open output file!" << std::endl;
}
std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = "
<< C[(N/2) * N + (N/2)] << ", C[N-1][N-1] = " << C[(N-1)*N + (N-
1)] << std::endl;
delete[] A;
delete[] B;
delete[] C;
return 0;
}
Shared: N = 8192, kernel time = 1690.28 ms
Sample outputs: C[0][0] = 2052.07, C[N/2][N/2] = 2059.19, C[N-1][N-1] = 2070.76
Register Tiling
Instead of computing one output element per thread, each thread computes a small 2x2 tile of outputs held in registers, on top of the shared-memory tiling from the previous kernel. This increases the amount of work done per shared-memory access and leverages fast register access.
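The benefit can be read off the inner loops: the plain shared-memory kernel performs one multiply-add per two shared-memory reads, while the 2x2 register-tiled kernel performs four multiply-adds per four shared-memory reads (a0, a1, b0, b1), doubling the arithmetic done per shared-memory access. A tiny illustrative Python comparison (based on the loop structure of the kernels in this notebook):

# FLOPs per shared-memory read in the inner loop (1 FMA = 2 FLOPs)
variants = {
    "shared tiling":         {"smem_reads": 2, "fmas": 1},  # tileA[ty][i], tileB[i][tx]
    "register tiling (2x2)": {"smem_reads": 4, "fmas": 4},  # a0, a1, b0, b1 -> 4 FMAs
}
for name, v in variants.items():
    print(f"{name}: {2 * v['fmas'] / v['smem_reads']:.1f} FLOPs per shared read")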
%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>
#define TILE_SIZE 16
#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line) {
if (code != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(code)
<< " " << file << ":" << line << std::endl;
exit(code);
}
}
__global__ void matMulKernel(const float* A, const float* B, float* C, int N) {
__shared__ float tileA[TILE_SIZE][TILE_SIZE];
__shared__ float tileB[TILE_SIZE][TILE_SIZE];
// Compute global row/col for 2x2 tile
int row = blockIdx.y * TILE_SIZE + threadIdx.y * 2;
int col = blockIdx.x * TILE_SIZE + threadIdx.x * 2;
float c00 = 0, c01 = 0, c10 = 0, c11 = 0;
for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; ++t) {
// Load 2 rows of A into shared memory
for (int i = 0; i < 2; ++i) {
int r = row + i;
int c = t * TILE_SIZE + threadIdx.x * 2;
if (r < N && c < N) tileA[threadIdx.y * 2 + i][threadIdx.x * 2] = A[r * N + c];
else tileA[threadIdx.y * 2 + i][threadIdx.x * 2] = 0.0f;
if (r < N && c + 1 < N) tileA[threadIdx.y * 2 + i][threadIdx.x * 2 + 1] = A[r * N + c + 1];
else tileA[threadIdx.y * 2 + i][threadIdx.x * 2 + 1] = 0.0f;
}
// Load 2 columns of B into shared memory
for (int i = 0; i < 2; ++i) {
int r = t * TILE_SIZE + threadIdx.y * 2;
int c = col + i;
if (r < N && c < N) tileB[threadIdx.y * 2][threadIdx.x * 2 + i] = B[r * N + c];
else tileB[threadIdx.y * 2][threadIdx.x * 2 + i] = 0.0f;
if (r + 1 < N && c < N) tileB[threadIdx.y * 2 + 1][threadIdx.x * 2 + i] = B[(r + 1) * N + c];
else tileB[threadIdx.y * 2 + 1][threadIdx.x * 2 + i] = 0.0f;
}
__syncthreads();
// Multiply shared memory tiles
for (int k = 0; k < TILE_SIZE; ++k) {
float a0 = tileA[threadIdx.y * 2][k];
float a1 = tileA[threadIdx.y * 2 + 1][k];
float b0 = tileB[k][threadIdx.x * 2];
float b1 = tileB[k][threadIdx.x * 2 + 1];
c00 += a0 * b0;
c01 += a0 * b1;
c10 += a1 * b0;
c11 += a1 * b1;
}
__syncthreads();
}
// Write back to global memory
if (row < N && col < N) {
C[row * N + col] = c00;
if (col + 1 < N) C[row * N + col + 1] = c01;
if (row + 1 < N) C[(row + 1) * N + col] = c10;
if (row + 1 < N && col + 1 < N) C[(row + 1) * N + col + 1] = c11;
}
}
void matMul(float *A, float *B, float *C, int N, float &ms) {
float *d_A, *d_B, *d_C;
size_t size = N * N * sizeof(float);
CUDA_CHECK(cudaMalloc(&d_A, size));
CUDA_CHECK(cudaMalloc(&d_B, size));
CUDA_CHECK(cudaMalloc(&d_C, size));
CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(TILE_SIZE / 2, TILE_SIZE / 2);
dim3 numBlocks((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);
cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
CUDA_CHECK(cudaEventRecord(start));
matMulKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUDA_CHECK(cudaFree(d_C));
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
}
int main(int argc, char **argv) {
std::ifstream inSize("matrix.size");
if (inSize.is_open() == false) {
std::cerr << "Unable to open input file!" << std::endl;
return 1;
}
int N;
inSize >> N;
const int size = N * N;
float *A = new float[size];
float *B = new float[size];
float *C = new float[size];
for (int i = 0; i < size; ++i) {
A[i] = static_cast<float>(rand()) / RAND_MAX;
B[i] = static_cast<float>(rand()) / RAND_MAX;
}
float ms = 0;
matMul(A, B, C, N, ms);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
}
std::cout << "Register: N = " << N << ", kernel time = " << ms << " ms" <<
std::endl;
std::ofstream out("register_time.time"); // No append flag = overwrite
if (out.is_open()) {
out << "Register: N = " << N << ", kernel time = " << ms << " ms" <<
std::endl;
out.close();
} else {
std::cerr << "Unable to open output file!" << std::endl;
}
std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = "
<< C[(N/2) * N + (N/2)] << ", C[N-1][N-1] = " << C[(N-1)*N + (N-
1)] << std::endl;
delete[] A;
delete[] B;
delete[] C;
return 0;
}
Register: N = 8192, kernel time = 1625.12 ms
Sample outputs: C[0][0] = 2052.07, C[N/2][N/2] = 2059.19, C[N-1][N-1] = 2070.76
Benchmarking
Finally, we parse the timing files written by the three programs and plot a comparison.
import re
import matplotlib.pyplot as plt

# Files you want to parse
time_files = [
    "naive_time.time",
    "shared_time.time",
    "register_time.time",
]

timings = []

# Extract label and time from each file
for file in time_files:
    with open(file, 'r') as f:
        content = f.read()
    match = re.search(r"^\s*([A-Za-z0-9+ ]+):.*?kernel time\s*=\s*([0-9.]+)", content)
    if match:
        label = match.group(1)
        time = float(match.group(2))
        timings.append((label, time))
    else:
        print(f"Warning: Couldn't parse {file}")

# Unzip
labels, times = zip(*timings)

# Plotting
plt.figure(figsize=(8, 5))
bars = plt.barh(labels, times)
plt.xlabel("Execution Time (ms)")
plt.title("GPU Matrix Multiplication Time Comparison")
plt.gca().invert_yaxis()

# Annotate bars
for bar, t in zip(bars, times):
    plt.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height() / 2,
             f"{t:.2f} ms", va='center')

plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
[Output: horizontal bar chart "GPU Matrix Multiplication Time Comparison" showing execution time in ms for the three kernels]
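For completeness, the same timings can be turned into effective throughput and speedup over the naive kernel. This sketch assumes it runs after the cell above (it reuses the timings list) and reads the matrix size back from matrix.size:

# Effective GFLOP/s and speedup vs. the naive kernel (assumes `timings` from the previous cell)
with open("matrix.size") as f:
    N = int(f.read())
flops = 2 * N**3                      # 2*N^3 FLOPs for a dense N x N matmul
baseline_ms = dict(timings).get("Naive")
for label, ms in timings:
    gflops = flops / (ms / 1e3) / 1e9
    speedup = f", {baseline_ms / ms:.2f}x vs naive" if baseline_ms else ""
    print(f"{label:>8}: {ms:8.2f} ms, {gflops:7.1f} GFLOP/s{speedup}")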