da/da6/_full_mat_c_u_d_a_8hpp_source.html

 /*******************************************************************************

  * Copyright (C) 2017-2023 Theodore Chang

  *

  * This program is free software: you can redistribute it and/or modify

  * it under the terms of the GNU General Public License as published by

  * the Free Software Foundation, either version 3 of the License, or

  * (at your option) any later version.

  *

  * This program is distributed in the hope that it will be useful,

  * but WITHOUT ANY WARRANTY; without even the implied warranty of

  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

  * GNU General Public License for more details.

  *

  * You should have received a copy of the GNU General Public License

  * along with this program.  If not, see <http://www.gnu.org/licenses/>.

  ******************************************************************************/

 // ReSharper disable CppCStyleCast

 #ifndef FULLMATCUDA_HPP

 #define FULLMATCUDA_HPP


 #ifdef SUANPAN_CUDA


 #include <cuda_runtime.h>

 #include <cusolverDn.h>

 #include "FullMat.hpp"


 /**
  * Dense matrix derived from FullMat<T> whose direct solver runs on the GPU through cuSOLVER.
  *
  * Each instance owns its own cuSOLVER handle, stream and device buffers. They are obtained by
  * acquire() in the constructors and given back by release() in the destructor. Copy construction
  * is available (needed by make_copy()), moving and assigning are disabled.
  */
 template<sp_d T> class FullMatCUDA final : public FullMat<T> {
     // cuSOLVER context and the stream it is bound to
     cusolverDnHandle_t handle{nullptr};
     cudaStream_t stream{nullptr};

     // device memory
     int* info{nullptr};    // status word handed to the cuSOLVER routines
     int* ipiv{nullptr};    // pivot vector, n_rows entries
     void* d_A{nullptr};    // matrix entries in float or double, n_elem entries
     void* buffer{nullptr}; // workspace sized by the getrf buffer size query

     // create the handle/stream and allocate the device memory
     void acquire();
     // destroy the handle/stream and free the device memory
     void release() const;

 public:
     // construct with the given numbers of rows and columns
     FullMatCUDA(uword, uword);
     // copy the host side through the base and acquire an independent set of device resources
     FullMatCUDA(const FullMatCUDA&);
     ~FullMatCUDA() override;

     // no move, no assignment
     FullMatCUDA(FullMatCUDA&&) noexcept = delete;
     FullMatCUDA& operator=(const FullMatCUDA&) = delete;
     FullMatCUDA& operator=(FullMatCUDA&&) noexcept = delete;

     // polymorphic copy
     unique_ptr<MetaMat<T>> make_copy() override;

     // solve for X with right hand side B, the first form forwards to the second
     int direct_solve(Mat<T>&, Mat<T>&&) override;
     int direct_solve(Mat<T>&, const Mat<T>&) override;
 };


 /**
  * Create the cuSOLVER handle and its stream and allocate every device buffer this object needs:
  * the status word, the pivot vector, the matrix copy and the LU workspace.
  *
  * Single precision storage is used when T is float or when the precision setting is MIXED,
  * double precision storage otherwise.
  */
 template<sp_d T> void FullMatCUDA<T>::acquire() {
     cusolverDnCreate(&handle);
     cudaStreamCreate(&stream);
     // all cuSOLVER work issued through the handle is queued on this stream
     cusolverDnSetStream(handle, stream);

     cudaMalloc(&info, sizeof(int));
     cudaMemset(info, 0, sizeof(int));
     cudaMalloc(&ipiv, sizeof(int) * this->n_rows);

     const auto n_row = int(this->n_rows);
     const auto n_col = int(this->n_cols);

     // the leading dimension must be the one later passed to getrf, which is the number of rows
     // the total element count is not a leading dimension and it overflows int for large matrices,
     // in which case the query is rejected and the workspace would be left with a size of zero
     const auto n_lead = n_row;

     int bufferSize = 0;

     if(std::is_same_v<T, float> || Precision::MIXED == this->setting.precision) {
         cudaMalloc(&d_A, sizeof(float) * this->n_elem);
         cusolverDnSgetrf_bufferSize(handle, n_row, n_col, (float*)d_A, n_lead, &bufferSize);
         // the query reports the workspace size in elements, not in bytes
         cudaMalloc(&buffer, sizeof(float) * bufferSize);
     }
     else {
         cudaMalloc(&d_A, sizeof(double) * this->n_elem);
         cusolverDnDgetrf_bufferSize(handle, n_row, n_col, (double*)d_A, n_lead, &bufferSize);
         // the query reports the workspace size in elements, not in bytes
         cudaMalloc(&buffer, sizeof(double) * bufferSize);
     }
 }


 /**
  * Give back everything obtained by acquire(): the cuSOLVER handle, the stream and the four
  * device buffers. Anything that is still null is skipped.
  */
 template<sp_d T> void FullMatCUDA<T>::release() const {
     // solver context first, then the stream
     if(nullptr != handle) cusolverDnDestroy(handle);
     if(nullptr != stream) cudaStreamDestroy(stream);

     // frees a device allocation unless it was never made
     const auto discard = [](void* const block) {
         if(nullptr != block) cudaFree(block);
     };

     discard(info);
     discard(d_A);
     discard(buffer);
     discard(ipiv);
 }


 /**
  * Construct a matrix of the given size and acquire the matching device resources.
  *
  * @param in_rows number of rows, forwarded to the base
  * @param in_cols number of columns, forwarded to the base
  */
 template<sp_d T> FullMatCUDA<T>::FullMatCUDA(const uword in_rows, const uword in_cols)
     : FullMat<T>(in_rows, in_cols) {
     // the base is complete at this point, acquire() reads the sizes through this
     this->acquire();
 }


 /**
  * Copy construct: the host side is copied by the base, the device side gets its own handle,
  * stream and buffers.
  *
  * acquire() only allocates, it does not fill the device buffers. If the copied state says the
  * matrix is already factorised, direct_solve() would skip the upload and the factorisation and
  * solve with uninitialised device memory. The LU factors and the pivots are therefore replicated
  * from the source in that case.
  * NOTE(review): whether the base copy carries the factored flag over is not visible here;
  * the branch below is simply not taken if it does not.
  *
  * @param other the matrix to copy
  */
 template<sp_d T> FullMatCUDA<T>::FullMatCUDA(const FullMatCUDA& other)
     : FullMat<T>(other) {
     acquire();

     if(!this->factored) return;

     // same storage criterion as used by acquire() to size d_A
     const auto unit = (std::is_same_v<T, float> || Precision::MIXED == this->setting.precision) ? sizeof(float) : sizeof(double);

     // blocking copies, the source has no pending work as direct_solve() synchronises before it returns
     const auto replicated = cudaSuccess == cudaMemcpy(d_A, other.d_A, unit * this->n_elem, cudaMemcpyDeviceToDevice) && cudaSuccess == cudaMemcpy(ipiv, other.ipiv, sizeof(int) * this->n_rows, cudaMemcpyDeviceToDevice);

     // fall back to a fresh factorisation on the next solve, direct_solve() rebuilds the factors from the host copy
     if(!replicated) this->factored = false;
 }


 /**
  * Hand the cuSOLVER handle, the stream and the device buffers back via release().
  */
 template<sp_d T> FullMatCUDA<T>::~FullMatCUDA() {
     this->release();
 }


 /**
  * Polymorphic copy.
  *
  * @return a new FullMatCUDA copy constructed from this object, owned through the MetaMat interface
  */
 template<sp_d T> unique_ptr<MetaMat<T>> FullMatCUDA<T>::make_copy() {
     unique_ptr<MetaMat<T>> clone = make_unique<FullMatCUDA<T>>(*this);
     return clone;
 }


 /**
  * Overload for a temporary right hand side. The storage of B is not reused, the call is simply
  * forwarded to the overload that takes B by const reference.
  *
  * @param X receives the solution
  * @param B right hand side
  * @return whatever the const reference overload returns
  */
 template<sp_d T> int FullMatCUDA<T>::direct_solve(Mat<T>& X, Mat<T>&& B) {
     const Mat<T>& rhs = B;
     return this->direct_solve(X, rhs);
 }


 /**
  * Solve A * X = B on the device by LU factorisation (cuSOLVER getrf followed by getrs).
  *
  * The matrix is uploaded and factorised only if the factored flag is not set; otherwise the
  * factors and pivots already held in d_A and ipiv are reused.
  *
  * Three paths exist:
  *  - pure float: T is float, factorise and solve in single precision;
  *  - mixed precision: the matrix is converted by to_float() and factorised in single precision,
  *    the solution is accumulated on the host by repeatedly solving for the scaled residual until
  *    the residual norm drops below setting.tolerance or setting.iterative_refinement passes ran;
  *  - pure double: factorise and solve in double precision.
  * The path that does not match T is discarded at compile time.
  *
  * Every CUDA and cuSOLVER return code as well as the device status word is checked. The factored
  * flag is only set after a factorisation that reported success.
  *
  * @param X receives the solution and is sized like B
  * @param B right hand side(s), the number of rows must equal the number of rows of this matrix
  * @return 0 on success, -1 on a dimension mismatch, on any failing CUDA/cuSOLVER call or if
  *         cuSOLVER reports a non-zero status (invalid argument or singular factor)
  */
 template<sp_d T> int FullMatCUDA<T>::direct_solve(Mat<T>& X, const Mat<T>& B) {
     // NOTE(review): a plain -1 is used as the failure code, swap in the project's own constant if there is one
     constexpr int failure = -1;

     // B is handed to getrs with a leading dimension of n_rows, any other row count would run past the device buffers
     if(B.n_rows != this->n_rows) return failure;

     // same storage criterion as used by acquire() to size d_A and the workspace
     const bool single = std::is_same_v<T, float> || Precision::MIXED == this->setting.precision;
     const size_t rhs_bytes = (single ? sizeof(float) : sizeof(double)) * B.n_elem;

     // owns the device copy of the right hand side and frees it on every exit path, including exceptions
     struct device_block {
         void* ptr = nullptr;

         ~device_block() {
             if(nullptr != ptr) cudaFree(ptr);
         }
     } d_x;

     if(cudaSuccess != cudaMalloc(&d_x.ptr, rhs_bytes)) {
         d_x.ptr = nullptr;
         return failure;
     }

     // fetches the status word written by the preceding cuSOLVER routine
     // waiting on the stream is sufficient as the handle is bound to it and all copies are queued on it
     const auto drain = [&]() -> int {
         int flag = failure;
         if(cudaSuccess != cudaMemcpyAsync(&flag, info, sizeof(int), cudaMemcpyDeviceToHost, stream)) return failure;
         if(cudaSuccess != cudaStreamSynchronize(stream)) return failure;
         return flag;
     };

     // uploads the host matrix (already in the working precision) and factorises it in place on the device
     const auto factorise = [&](const void* const host, const size_t bytes) -> bool {
         if(cudaSuccess != cudaMemcpyAsync(d_A, host, bytes, cudaMemcpyHostToDevice, stream)) return false;

         const auto code = single ? cusolverDnSgetrf(handle, int(this->n_rows), int(this->n_cols), (float*)d_A, int(this->n_rows), (float*)buffer, ipiv, info) : cusolverDnDgetrf(handle, int(this->n_rows), int(this->n_cols), (double*)d_A, int(this->n_rows), (double*)buffer, ipiv, info);

         return CUSOLVER_STATUS_SUCCESS == code && 0 == drain();
     };

     // uploads the right hand side from source, solves in place on the device and downloads the result to target
     const auto substitute = [&](const void* const source, void* const target) -> bool {
         if(cudaSuccess != cudaMemcpyAsync(d_x.ptr, source, rhs_bytes, cudaMemcpyHostToDevice, stream)) return false;

         const auto code = single ? cusolverDnSgetrs(handle, CUBLAS_OP_N, int(this->n_rows), int(B.n_cols), (float*)d_A, int(this->n_rows), ipiv, (float*)d_x.ptr, int(this->n_rows), info) : cusolverDnDgetrs(handle, CUBLAS_OP_N, int(this->n_rows), int(B.n_cols), (double*)d_A, int(this->n_rows), ipiv, (double*)d_x.ptr, int(this->n_rows), info);

         if(CUSOLVER_STATUS_SUCCESS != code) return false;

         if(cudaSuccess != cudaMemcpyAsync(target, d_x.ptr, rhs_bytes, cudaMemcpyDeviceToHost, stream)) return false;

         return 0 == drain();
     };

     if constexpr(std::is_same_v<T, float>) {
         // pure float
         if(!this->factored) {
             if(!factorise(this->memptr(), sizeof(float) * this->n_elem)) return failure;

             this->factored = true;
         }

         X.set_size(arma::size(B));

         if(!substitute(B.memptr(), X.memptr())) return failure;
     }
     else if(Precision::MIXED == this->setting.precision) {
         // mixed precision
         if(!this->factored) {
             this->s_memory = this->to_float();

             if(!factorise(this->s_memory.memptr(), sizeof(float) * this->s_memory.n_elem)) return failure;

             this->factored = true;
         }

         X = arma::zeros(B.n_rows, B.n_cols);

         mat full_residual = B;

         auto multiplier = norm(full_residual);

         auto counter = 0u;
         while(counter++ < this->setting.iterative_refinement) {
             if(multiplier < this->setting.tolerance) break;

             // scale to unit norm before dropping to single precision
             auto residual = conv_to<fmat>::from(full_residual / multiplier);

             if(!substitute(residual.memptr(), residual.memptr())) return failure;

             const mat incre = multiplier * conv_to<mat>::from(residual);

             X += incre;

             // update outside of the logging call so that the iteration does not depend on how the macro treats its arguments
             full_residual -= this->operator*(incre);
             multiplier = arma::norm(full_residual);

             suanpan_debug("Mixed precision algorithm multiplier: {:.5E}.\n", multiplier);
         }
     }
     else {
         // pure double
         if(!this->factored) {
             if(!factorise(this->memptr(), sizeof(double) * this->n_elem)) return failure;

             this->factored = true;
         }

         X.set_size(arma::size(B));

         if(!substitute(B.memptr(), X.memptr())) return failure;
     }

     return 0;
 }


 #endif


 #endif


FullMat.hpp

Precision::MIXED
@ MIXED

FullMatCUDA
A FullMatCUDA class that holds matrices.

FullMat
A FullMat class that holds matrices.
Definition: FullMat.hpp:35

MetaMat
A MetaMat class that holds matrices.
Definition: MetaMat.hpp:39

DOF::T
@ T

suanpan::make_copy
unique_ptr< Material > make_copy(const shared_ptr< Material > &)
Definition: Material.cpp:357

suanpan::info
void info(const std::string_view format_str, const T &... args)
Definition: suanPan.h:237

tensor::strain::norm
double norm(const vec &)
Definition: tensor.cpp:302

sp_d
concept sp_d
Definition: suanPan.h:318

suanpan_debug
#define suanpan_debug(...)
Definition: suanPan.h:295