Nvidia CUDA in Jupyter¶

What is CUDA?¶

It's Nvidia's GPU computing platform. I don't like a lot of things about Nvidia or what they do, but there aren't a lot of alternatives. It works by compiling C or C++ code into PTX assembly, then passing that to the GPU, where it can run across tons of threads. A lot of machine-learning papers are built around GPU-related foolery and require close-to-the-metal access, so it'd be nice to have nvcc working in Jupyter (where I do most of my notes and re-implementations of papers). I'll also probably use it just as a way to compile C code in Jupyter, because all the other plugins I've tried don't really work.

Setting It Up¶

First install this random plugin from the internet without verifying that it isn't malware.

In [2]:
!pip install nvcc4jupyter
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: nvcc4jupyter in /home/jarlold/.sage/local/lib/python3.10/site-packages (1.2.1)

Then blindly execute it with full user permissions in my home folder.

In [2]:
%load_ext nvcc4jupyter
Source files will be saved in "/tmp/tmp63l0jrw0".

Host vs Device Code¶

CUDA is pretty much just good old-fashioned C, except for a few things:

  • We can call some functions using a special kind of multi-threading for GPUs.
  • Every function we write has to be designated as device, host, or global (and that last one comes with some restrictions).
  • When we allocate, copy, or free memory, we need to be mindful of where that memory lives (on the CPU or the GPU).

To get data to and from the GPU, we pretty much always have to allocate memory on both sides and then copy it over. As mentioned, there are three kinds of functions in CUDA: host, device, and global. host really means CPU and device really means GPU. In general, host functions can only call host and global functions, and device functions can only be called from global and device functions. That means getting work from the CPU onto the GPU (or back) has to go through a global function. But global functions must always return void, so we just pass them pointers to arrays for the output.
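
To make that concrete, here's a minimal sketch of the three qualifiers (the function names are made up for illustration):

__device__ int cube(int v) { return v * v * v; } //# GPU-only helper, callable from device and global functions

__global__ void fill_cubes(int* out) { //# the entry point: launched from the host, must return void
    out[threadIdx.x] = cube(threadIdx.x);
}

int main() { //# plain host code (__host__ is the default, so we can leave it off)
    int* device_out;
    cudaMalloc(&device_out, 32 * sizeof(int));
    fill_cubes<<<1, 32>>>(device_out);
    cudaFree(device_out);
    return 0;
}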

Cookbook¶

Hello World¶

In [4]:
%%cuda
#include <iostream>
int main() {
    std::cout << "CUDA is set up right!\n";
    return 0;
}
CUDA is set up right!

Sharing Memory¶

In [23]:
%%cuda
#include <stdio.h>

__global__ void operate(int* x, int size_x) {
    //# We'll just cube every number in the chunk
    for (int i = 0; i < size_x; i++) {
        x[i] = x[i] * x[i] * x[i];
    }
}

int main(void) {
    //# Create an array of 2^20 integers
    long int N = 1<<20;
    int *device_x, *host_x;
    cudaMalloc(&device_x, N*sizeof(int));
    host_x = (int*)malloc(N*sizeof(int));

    //# Initialize the host x as all 2s
    for (int i = 0; i < N; i++) {
        host_x[i] = 2;
    }

    //# Then copy it over to the device list
    cudaMemcpy(device_x, host_x, N*sizeof(int), cudaMemcpyHostToDevice);

    //# Run our cool function on them
    operate<<<1, 1>>>(device_x, N);

    //# Then copy them over again
    cudaMemcpy(host_x, device_x, N*sizeof(int), cudaMemcpyDeviceToHost);

    //# And we'll just print the first one since the results will all be the same
    printf("Value is %d\n", host_x[0]);

    //# We can now deallocate it 
    cudaFree(device_x);
    free(host_x);
}
Value is 8
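
One thing worth noting: none of the CUDA calls above are checked for errors, and they fail silently. A minimal sketch of checking the kernel launch in the cell above, using the runtime's cudaGetLastError and cudaGetErrorString:

//# After the launch, ask the runtime whether anything went wrong
operate<<<1, 1>>>(device_x, N);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
    printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
}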

Assembly Code (PTX)¶

A full list of the assembly instructions is available in Nvidia's PTX ISA documentation, but for the code below we'll only use mul.
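
If you want to see the PTX that nvcc generates for your own code, you can ask it to stop at that stage (the file names here are just for illustration):

nvcc -ptx kernel.cu -o kernel.ptx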

The mul instruction is a good one for demonstrating operand modes; the general syntax goes something like this:

mul.mode.type d, a, b

Where mode is one of the following:

  • .hi which means "multiply a and b, and keep the upper half of the double-width product in a register the same size as a and b."
  • .lo which is like the above, but keeps the lower half.
  • .wide which means "multiply a and b, and write the full double-width product to a register twice the size of a or b."

Then type is one of the following, which specifies the type of the two input operands a and b (they must both be the same type):

  • u16, u32, u64
  • s16, s32, s64
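
For instance, the difference between .lo and .wide looks something like this (a quick sketch; the register names are illustrative):

mul.lo.u32 %r2, %r0, %r1;    // 32x32 multiply, keep the low 32 bits in a 32-bit register
mul.wide.u32 %rd2, %r0, %r1; // 32x32 multiply, keep the full 64-bit product in a 64-bit register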

So for the example below, we want to multiply two unsigned 32-bit integers and keep the lower half of the product. That means we want to use mul.lo.u32.

In [6]:
%%cuda
#include <stdio.h>
#include <time.h>

__device__ int do_assembly(unsigned int x) {
    //# We'll allocate a normal integer y to be our output register
    int y;

    //# Then run this inline assembly, which uses mul.lo.u32 twice (since we're cubing)
    asm(
        //# y = x * x
        " mul.lo.u32 %0, %1, %1;"
        //# y = y * x
        " mul.lo.u32 %0, %0, %1;"        
        //# This specifies that %0 is our output register, and that it should be bound to y.
        //# and that %1 is our input register, and it should be bound to x.
        //# (if we had another input it would be %2)
        : "=r"(y) : "r" (x));     
    return y;
}

__global__ void operate(int* x, int size_x) {
    //# We'll just cube every number in the chunk
    for (int i = 0; i < size_x; i++) {
        //# Then in this example we'll just call our assembly code on it
        x[i] = do_assembly(x[i]);
    }
}

int main(void) {
    //# Create an array of 2^20 integers
    long int N = 1<<20;
    int *device_x, *host_x;
    cudaMalloc(&device_x, N*sizeof(int));
    host_x = (int*)malloc(N*sizeof(int));

    //# Initialize the host x as all 2s
    for (int i = 0; i < N; i++) {
        host_x[i] = 2;
    }

    //# Then copy it over to the device list
    cudaMemcpy(device_x, host_x, N*sizeof(int), cudaMemcpyHostToDevice);

    //# Run our cool function on them, timing it of course
    printf("Time is %lu\n", time(NULL));
    operate<<<1, 1>>>(device_x, N);
    printf("Time is %lu\n", time(NULL));

    //# Then copy them over again
    cudaMemcpy(host_x, device_x, N*sizeof(int), cudaMemcpyDeviceToHost);

    //# And we'll just print the first one since the results will all be the same
    printf("Value is %d\n", host_x[0]);

    //# We can now deallocate it 
    cudaFree(device_x);
    free(host_x);
}
Time is 1728776155
Time is 1728776155
Value is 8

Temporary Registers in PTX¶

If we want to allocate and use temporary registers, we're almost always better off giving them some scope with { and } (just like in any C-like language). That way the register names can't collide if the compiler expands the assembly in more than one place.

In [5]:
%%cuda
#include <stdio.h>

__device__ int do_assembly(unsigned int x) {
    int y;
    asm("{"
        //# Declare a temporary 32-bit register, scoped to this block
        ".reg .u32 t1;"
        //# t1 = x * x
        " mul.lo.u32 t1, %1, %1;"
        //# y = t1
        " mov.u32 %0, t1;"
        "}"
        : "=r"(y) : "r" (x));
    return y;
}

__global__ void operate(int* x, int size_x) {
    //# We'll just square every number in the chunk (this version only multiplies once)
    for (int i = 0; i < size_x; i++) {
        //# Then in this example we'll just call our assembly code on it
        x[i] = do_assembly(x[i]);
    }
}

int main(void) {
    //# Create an array of 2^20 integers
    long int N = 1<<20;
    int *device_x, *host_x;
    cudaMalloc(&device_x, N*sizeof(int));
    host_x = (int*)malloc(N*sizeof(int));

    //# Initialize the host x as all 2s
    for (int i = 0; i < N; i++) {
        host_x[i] = 2;
    }

    //# Then copy it over to the device list
    cudaMemcpy(device_x, host_x, N*sizeof(int), cudaMemcpyHostToDevice);

    //# Run our cool function on them
    operate<<<1, 1>>>(device_x, N);

    //# Then copy them over again
    cudaMemcpy(host_x, device_x, N*sizeof(int), cudaMemcpyDeviceToHost);

    //# And we'll just print the first one since the results will all be the same
    printf("Value is %d\n", host_x[0]);

    //# We can now deallocate it 
    cudaFree(device_x);
    free(host_x);
}
Value is 4

Matrix Multiplication¶

This time we launch a real 2D grid: each thread computes a single entry of C (one inner product), working out its row and column from blockIdx, blockDim, and threadIdx.

In [22]:
%%cuda
#include <stdio.h>
#include <stdlib.h> //# for random()
#include <cuda_runtime.h>

__global__ void inner_product(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if(row < N && col < N) {
        float sum = 0;
        for(int k = 0; k < N; ++k) sum += A[row * N + k] * B[k * N + col];
        C[row * N + col] = sum;
    }
}

void print_matrix(float* m, int n) {
    for (int i = 0; i < n*n; i++) {
        if (i % n == 0) printf("\n");
        printf("%06.2f ", m[i]);
    }
}

int main() {
    int N = 8;
    float *A, *B, *C;
    float *d_A, *d_B, *d_C;

    //# Make 3 matrices of size N*N
    A = (float *)malloc(N * N * sizeof(float));
    B = (float *)malloc(N * N * sizeof(float));
    C = (float *)malloc(N * N * sizeof(float));

    //# Then initialize all their values to random floating-point numbers (except C of course)
    for (int i = 0; i < N*N; i++) {
        A[i] = random()/(float)random();
        B[i] = random()/(float)random();
    }

    //# Allocate space for them on the GPU
    cudaMalloc((void **)&d_A, N * N * sizeof(float));
    cudaMalloc((void **)&d_B, N * N * sizeof(float));
    cudaMalloc((void **)&d_C, N * N * sizeof(float));

    //# Then copy them over to the GPU
    cudaMemcpy(d_A, A, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N * N * sizeof(float), cudaMemcpyHostToDevice);

    //# Then finally we can call the matrix multiplication
    dim3 threadsPerBlock(N, N);
    //# Round up so we'd still cover every row/column if N weren't a multiple of the block size
    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
    inner_product<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

    //# Then copy the answer back
    cudaMemcpy(C, d_C, N * N * sizeof(float), cudaMemcpyDeviceToHost);

    //# And print the result
    print_matrix(C, N);

    //# And free up all the memory we allocated
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(A);
    free(B);
    free(C);

    return 0;
}
038.71 020.86 622.17 025.69 038.73 083.74 048.12 012.90 
005.49 017.14 043.33 008.36 013.66 013.23 009.11 005.72 
012.90 028.07 136.42 014.53 015.16 021.71 037.36 007.02 
014.61 018.56 163.75 016.92 022.81 027.45 033.80 010.13 
008.39 010.43 057.83 008.36 019.13 014.72 025.88 005.55 
018.82 264.59 082.10 043.10 059.06 038.45 070.76 019.03 
021.69 044.17 088.10 019.55 027.25 024.69 090.19 014.09 
017.12 039.01 302.73 023.22 015.97 038.42 021.50 007.87 

Timing Functions¶

For one reason or another, the time(NULL); way of getting the system time (like you'd usually use in C) is not terribly useful in CUDA. It's accurate when you're just running host functions on the CPU, but kernel launches are asynchronous: the CPU queues the kernel and moves on immediately, so timestamps taken around a launch only measure the launch itself, not the GPU work (which is why the two Time is prints earlier came out identical). According to Nvidia's documentation the right way to time this is using CUDA Events. For our example (and for most uses) we'll only need the following utilities:

  • cudaEvent_t event; the type CUDA uses to store recorded events.
  • cudaEventCreate(event_pointer); we have to initialize the events using this.
  • cudaEventRecord(event); records when an event happens.
  • cudaEventSynchronize(event); stops the CPU from executing any code until the GPU records event.
  • cudaEventElapsedTime(output_float, event1, event2); computes the milliseconds between event1 and event2 and stores the result in output_float.

In [35]:
%%cuda
#include <stdio.h>
__global__ void operate(int* x, int size_x) {
    //# We'll just cube every number in the chunk
    for (int i = 0; i < size_x; i++) {
        x[i] = x[i] * x[i] * x[i];
    }
}

int main(void) {
    //# Create an array as long as THE MAX VALUE OF AN UNSIGNED 32-BIT INT
    unsigned long int N = 4294967295;
    int *device_x, *host_x;
    cudaMalloc(&device_x, N*sizeof(int));
    host_x = (int*)malloc(N*sizeof(int));

    //# Create some CUDA events so we can keep track of time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    //# Initialize the host x as all 2s
    for (unsigned long int i = 0; i < N; i++) {
        host_x[i] = 2;
    }

    //# Then copy it over to the device list
    cudaMemcpy(device_x, host_x, N*sizeof(int), cudaMemcpyHostToDevice);

    //# Record the start time
    cudaEventRecord(start);

    //# Run our cool function
    operate<<<1, 1>>>(device_x, N);

    //# Record the stop time
    cudaEventRecord(stop);

    //# This function stops the CPU execution until the "stop" event is recorded
    cudaEventSynchronize(stop);

    //# Then we'll compute the elapsed time using another CUDA function
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Operation completed in %fms\n", milliseconds);
    
    //# Then copy them over again
    cudaMemcpy(host_x, device_x, N*sizeof(int), cudaMemcpyDeviceToHost);

    //# And we'll just print the first one since the results will all be the same
    printf("Value is %d\n", host_x[0]);

    //# We can now deallocate it 
    cudaFree(device_x);
    free(host_x);
}
Operation completed in 0.006144ms
Value is 2

(Note the value is still 2 rather than 8: N doesn't fit in the kernel's int size_x parameter, so it gets truncated to -1 on typical systems and the loop body never runs, which is also why the "operation" finishes in microseconds.)
