CUDA toolkit

CUDA basic usage

CUDA version 10 is installed on VISION, together with Thrust, a C++ library that mimics STL behavior (https://thrust.github.io/). Every graphics card can be used for CUDA computation.
Only one GPU card per job can be selected, due to the queue policy.

#include <cstdlib>
#include <iostream>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/reduce.h>
#include <algorithm>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <thrust/version.h>

using namespace std;
using namespace thrust;

// function to print information on devices
void get_gpus_info()
{
    int major = THRUST_MAJOR_VERSION;
    int minor = THRUST_MINOR_VERSION;
    const int kb = 1024;
    const int mb = kb * kb;
    wcout << "NBody.GPU" << endl << "=========" << endl << endl;

    wcout << "CUDA version:   v" << CUDART_VERSION << endl;
    cout << "Thrust v" << major << "." << minor << endl;
    int devCount;
    cudaGetDeviceCount(&devCount);
    wcout << "CUDA Devices: " << devCount << endl << endl;

    for(int i = 0; i < devCount; ++i)
    {
        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, i);
        wcout << i << ": " << props.name << ": " << props.major << "." << props.minor << endl;
        wcout << "  Global memory:   " << props.totalGlobalMem / mb << "mb" << endl;
        wcout << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kb" << endl;
        wcout << "  Constant memory: " << props.totalConstMem / kb << "kb" << endl;
        wcout << "  Block registers: " << props.regsPerBlock << endl << endl;

        wcout << "  Warp size:         " << props.warpSize << endl;
        wcout << "  Threads per block: " << props.maxThreadsPerBlock << endl;
        wcout << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", " << props.maxThreadsDim[1]  << ", " << props.maxThreadsDim[2] << " ]" << endl;
        wcout << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", " << props.maxGridSize[1]  << ", " << props.maxGridSize[2] << " ]" << endl;
        wcout << endl;
    }

}

// Example: generate, sort and reduce 32M random integers on the GPU with thrust
int main(void)
{
    get_gpus_info(); // print device information (see output below)

    //cudaSetDevice(3); // it's possible to select specific device

    // generate 32M random numbers serially
    thrust::host_vector<int> h_vec(32 << 20);
    thrust::generate(h_vec.begin(), h_vec.end(), rand);
    cout << "size HOST vector " << h_vec.size() << endl;
    // transfer data to the device
    thrust::device_vector<int> d_vec = h_vec;

    // sort data on the device
    thrust::sort(d_vec.begin(), d_vec.end());

    // add all components from device vector and store the result inside x
    int x = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus<int>());

    // copy from device to host
    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
    std::cout << "x = " << x << std::endl;
    return 0;
}
  • To compile this program, use the nvcc compiler:
[homer@vision cuda_test]$ module add cuda/10.1
[homer@vision cuda_test]$ nvcc -o thrust_test thrust.cu
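If needed, the target GPU architecture can also be passed to nvcc explicitly; for the Tesla V100 reported below (compute capability 7.0), for example:
nvcc -arch=sm_70 -o thrust_test thrust.cu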
  • To launch the created executable correctly, you must use the gpuq queue. The selected device is assigned according to the queue policy:
[homer@vision cuda_test]$ cat thrust_test.pbs
#!/bin/bash
#PBS -N THRUST
#PBS -l select=1:ngpus=1
#PBS -l walltime=0:10:00
#PBS -j oe
#PBS -q gpuq

cd ${PBS_O_WORKDIR}

module purge
module add cuda/10.1

./thrust_test
  • Run it:

[homer@vision cuda_test]$ qsub thrust_test.pbs
5682.vision

[homer@vision cuda_test]$ more THRUST.o5682
NBody.GPU
=========

CUDA version:   v10010
CUDA Devices: 1

0: Tesla V100-PCIE-32GB: 7.0
  Global memory:   32510mb
  Shared memory:   48kb
  Constant memory: 64kb
  Block registers: 65536

  Warp size:         32
  Threads per block: 1024
  Max block dimensions: [ 1024, 1024, 64 ]
  Max grid dimensions:  [ 2147483647, 65535, 65535 ]

size HOST vector 33554432
x = 848185404

Transfer CPU computation (BLAS) to the GPU

As part of CUDA, NVIDIA provides a drop-in version of the BLAS library, nvblas. It uses cuBLAS as its back-end library, and the memory management is fully hidden inside the "standard" BLAS routines. This makes it possible to use cuBLAS effectively in any program that calls BLAS routines through the standard interface. Moreover, there is no need to recompile the program, since the libnvblas.so library can be preloaded using the LD_PRELOAD variable.
nvblas supports only BLAS level 3 operations, and the free license allows you to use one NVIDIA board per node.
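
To make the mechanism concrete, here is a minimal sketch (the file name dgemm_test.cpp and the matrix size are illustrative, and a CPU BLAS library is assumed to be available for linking). It calls the standard Fortran-style dgemm routine, i.e. the BLAS level 3 interface that nvblas intercepts:

// dgemm_test.cpp : multiply two matrices through the standard BLAS dgemm routine.
// Built against the CPU BLAS, the same binary can later be run with
// LD_PRELOAD pointing to libnvblas.so to offload the call to the GPU.
#include <iostream>
#include <vector>

// Fortran BLAS interface (the symbols intercepted by nvblas)
extern "C" void dgemm_(const char* transa, const char* transb,
                       const int* m, const int* n, const int* k,
                       const double* alpha, const double* a, const int* lda,
                       const double* b, const int* ldb,
                       const double* beta, double* c, const int* ldc);

int main()
{
    const int n = 4096;                            // matrix dimension (illustrative)
    std::vector<double> A(n * n, 1.0), B(n * n, 2.0), C(n * n, 0.0);
    const double alpha = 1.0, beta = 0.0;
    const char no_trans = 'N';

    // C = alpha * A * B + beta * C (column-major storage, as BLAS expects)
    dgemm_(&no_trans, &no_trans, &n, &n, &n,
           &alpha, A.data(), &n, B.data(), &n, &beta, C.data(), &n);

    std::cout << "C[0] = " << C[0] << std::endl;   // expect 2.0 * n
    return 0;
}

Compiled once against the CPU BLAS (for instance g++ dgemm_test.cpp -lblas -o dgemm_test), the unchanged binary can then be launched with LD_PRELOAD="/usr/lib64/libnvblas.so.10" so that the dgemm call goes through nvblas.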

A configuration file, nvblas.conf, is needed. It must be placed in the working directory, or its path must be given through the NVBLAS_CONFIG_FILE environment variable.

NVBLAS_LOGFILE  nvblas.log
NVBLAS_CPU_BLAS_LIB  /the/path/to/cpu/lib/libblas.so
NVBLAS_GPU_LIST ALL
NVBLAS_TILE_DIM 2048
NVBLAS_AUTOPIN_MEM_ENABLED
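
If the file does not sit in the working directory, point nvblas to it through the NVBLAS_CONFIG_FILE variable, for example (illustrative path):

export NVBLAS_CONFIG_FILE=/path/to/nvblas.conf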

Consider the simple Python file test.py:

import time
import numpy as np
n=10**4
t=time.time()
A = np.random.random((n,n))
B = np.random.random((n,n))
C = A.dot(B)
print("Done in {}".format(time.time()-t)))

The matrix product A.dot(B) on these 2-D arrays goes through the BLAS level 3 routine dgemm, which is exactly the kind of operation nvblas can offload.
You need to use the gpuq queue in PBS, which gives access to the GPU cards.
The following command opens an interactive session in the gpuq queue and selects one GPU for 10 minutes:

 qsub -I -l select=1:ngpus=1 -q gpuq -l walltime=0:10:00 

You can then run this file with the following commands, and check which BLAS library NumPy uses:

[homer@vision cuda_test]$ module add python/py3.8
[homer@vision cuda_test]$ python -m threadpoolctl -i numpy
[
  {
    "filepath": "/zfs/softs/python/3.8.6/lib/python3.8/site-packages/numpy.libs/libopenblasp-r0-34a18dc3.3.7.so",
    "prefix": "libopenblas",
    "user_api": "blas",
    "internal_api": "openblas",
    "version": "0.3.7",
    "num_threads": 64,
    "threading_layer": "pthreads" 
  }
]
[homer@vision cuda_test]$ python ./test.py
Done in 4.1173996925354

This run uses NumPy's internal BLAS library. If you want to use the NVIDIA BLAS instead, define an nvblas.conf file in your working directory (as shown below), with NVBLAS_CPU_BLAS_LIB pointing to the classic BLAS library, then set the LD_PRELOAD variable before running your code (in this case, the directory containing this classic BLAS library also has to be added to LD_LIBRARY_PATH):

[homer@vision cuda_test]$ cat nvblas.conf
NVBLAS_LOGFILE  nvblas.log
NVBLAS_CPU_BLAS_LIB /zfs/softs/python/3.8.6/lib/python3.8/site-packages/numpy.libs/libopenblasp-r0-34a18dc3.3.7.so
NVBLAS_GPU_LIST ALL
NVBLAS_TILE_DIM 2048
NVBLAS_AUTOPIN_MEM_ENABLED
[homer@vision cuda_test]$ export LD_LIBRARY_PATH=/zfs/softs/python/3.8.6/lib/python3.8/site-packages/numpy.libs:$LD_LIBRARY_PATH
[homer@vision cuda_test]$ LD_PRELOAD="/usr/lib64/libnvblas.so.10" python ./test.py

This will offload the calculations to the GPU. You can check its status, memory usage and core utilization with the nvidia-smi -l command. libnvblas.so will try to load other libraries, so your LD_LIBRARY_PATH variable should contain the path to your CUDA libraries.
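
For a non-interactive run, the same steps can be collected in a batch script for the gpuq queue, along the lines of the thrust example above (a sketch that reuses the module and library paths from this section; nvblas.conf is assumed to sit in the submission directory):

#!/bin/bash
#PBS -N NVBLAS
#PBS -l select=1:ngpus=1
#PBS -l walltime=0:10:00
#PBS -j oe
#PBS -q gpuq

cd ${PBS_O_WORKDIR}

module purge
module add python/py3.8

export LD_LIBRARY_PATH=/zfs/softs/python/3.8.6/lib/python3.8/site-packages/numpy.libs:$LD_LIBRARY_PATH
LD_PRELOAD="/usr/lib64/libnvblas.so.10" python ./test.py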