suanPan
SparseMatCUDA.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * Copyright (C) 2017-2022 Theodore Chang
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  ******************************************************************************/
31 // ReSharper disable CppCStyleCast
32 #ifndef SPARSEMATCUDA_HPP
33 #define SPARSEMATCUDA_HPP
34 
35 #ifdef SUANPAN_CUDA
36 
37 #include <cusolverSp.h>
38 #include <cusparse.h>
39 #include "SparseMat.hpp"
40 #include "csr_form.hpp"
41 
42 template<sp_d T> class SparseMatCUDA final : public SparseMat<T> {
43  cusolverSpHandle_t handle = nullptr;
44  cudaStream_t stream = nullptr;
45  cusparseMatDescr_t descr = nullptr;
46 
47  void* d_val_idx = nullptr;
48  void* d_col_idx = nullptr;
49  void* d_row_ptr = nullptr;
50 
51  void acquire();
52  void release() const;
53 
54  template<sp_d ET> void device_alloc(csr_form<ET, int>&&);
55  void device_dealloc() const;
56 
57 public:
58  SparseMatCUDA(uword, uword, uword = 0);
60  SparseMatCUDA(SparseMatCUDA&&) noexcept = delete;
61  SparseMatCUDA& operator=(const SparseMatCUDA&) = delete;
62  SparseMatCUDA& operator=(SparseMatCUDA&&) noexcept = delete;
63  ~SparseMatCUDA() override;
64 
65  unique_ptr<MetaMat<T>> make_copy() override;
66 
67  int solve(Mat<T>&, const Mat<T>&) override;
68 };
69 
70 template<sp_d T> void SparseMatCUDA<T>::acquire() {
71  cusolverSpCreate(&handle);
72  cudaStreamCreate(&stream);
73  cusolverSpSetStream(handle, stream);
74  cusparseCreateMatDescr(&descr);
75  cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
76  cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
77  cudaMalloc(&d_row_ptr, sizeof(int) * (this->n_rows + 1));
78 }
79 
80 template<sp_d T> void SparseMatCUDA<T>::release() const {
81  if(handle) cusolverSpDestroy(handle);
82  if(stream) cudaStreamDestroy(stream);
83  if(descr) cusparseDestroyMatDescr(descr);
84  if(d_row_ptr) cudaFree(d_row_ptr);
85 }
86 
87 template<sp_d T> template<sp_d ET> void SparseMatCUDA<T>::device_alloc(csr_form<ET, int>&& csr_mat) {
88  const size_t n_val = sizeof(ET) * csr_mat.n_elem;
89  const size_t n_col = sizeof(int) * csr_mat.n_elem;
90 
91  cudaMalloc(&d_val_idx, n_val);
92  cudaMalloc(&d_col_idx, n_col);
93 
94  cudaMemcpyAsync(d_val_idx, csr_mat.val_mem(), n_val, cudaMemcpyHostToDevice, stream);
95  cudaMemcpyAsync(d_col_idx, csr_mat.col_mem(), n_col, cudaMemcpyHostToDevice, stream);
96  cudaMemcpyAsync(d_row_ptr, csr_mat.row_mem(), sizeof(int) * (csr_mat.n_rows + 1), cudaMemcpyHostToDevice, stream);
97 
98  cudaDeviceSynchronize();
99 }
100 
101 template<sp_d T> void SparseMatCUDA<T>::device_dealloc() const {
102  if(d_val_idx) cudaFree(d_val_idx);
103  if(d_col_idx) cudaFree(d_col_idx);
104 }
105 
106 template<sp_d T> SparseMatCUDA<T>::SparseMatCUDA(const uword in_row, const uword in_col, const uword in_elem)
107  : SparseMat<T>(in_row, in_col, in_elem) { acquire(); }
108 
109 template<sp_d T> SparseMatCUDA<T>::SparseMatCUDA(const SparseMatCUDA& other)
110  : SparseMat<T>(other) { acquire(); }
111 
112 template<sp_d T> SparseMatCUDA<T>::~SparseMatCUDA() {
113  release();
114  device_dealloc();
115 }
116 
117 template<sp_d T> unique_ptr<MetaMat<T>> SparseMatCUDA<T>::make_copy() { return std::make_unique<SparseMatCUDA<T>>(*this); }
118 
119 template<sp_d T> int SparseMatCUDA<T>::solve(Mat<T>& X, const Mat<T>& B) {
120  if(!this->factored) {
121  // deallocate memory previously allocated for csr matrix
122  device_dealloc();
123 
124  std::is_same_v<T, float> || Precision::MIXED == this->precision ? device_alloc(csr_form<float, int>(this->triplet_mat)) : device_alloc(csr_form<double, int>(this->triplet_mat));
125 
126  this->factored = true;
127  }
128 
129  const size_t n_rhs = (std::is_same_v<T, float> || Precision::MIXED == this->precision ? sizeof(float) : sizeof(double)) * B.n_elem;
130 
131  void* d_b = nullptr;
132  void* d_x = nullptr;
133 
134  cudaMalloc(&d_b, n_rhs);
135  cudaMalloc(&d_x, n_rhs);
136 
137  int singularity;
138 
139  if(std::is_same_v<T, float>) {
140  cudaMemcpyAsync(d_b, B.memptr(), n_rhs, cudaMemcpyHostToDevice, stream);
141 
142  for(auto I = 0llu; I < B.n_elem; I += B.n_rows) { cusolverSpScsrlsvqr(handle, int(this->n_rows), int(this->triplet_mat.n_elem), descr, (float*)d_val_idx, (int*)d_row_ptr, (int*)d_col_idx, (float*)d_b + I, float(this->tolerance), 0, (float*)d_x + I, &singularity); }
143 
144  X.set_size(arma::size(B));
145 
146  cudaMemcpyAsync(X.memptr(), d_x, n_rhs, cudaMemcpyDeviceToHost, stream);
147 
148  cudaDeviceSynchronize();
149  }
150  else if(Precision::FULL == this->precision) {
151  cudaMemcpyAsync(d_b, B.memptr(), n_rhs, cudaMemcpyHostToDevice, stream);
152 
153  for(auto I = 0llu; I < B.n_elem; I += B.n_rows) { cusolverSpDcsrlsvqr(handle, int(this->n_rows), int(this->triplet_mat.n_elem), descr, (double*)d_val_idx, (int*)d_row_ptr, (int*)d_col_idx, (double*)d_b + I, this->tolerance, 0, (double*)d_x + I, &singularity); }
154 
155  X.set_size(arma::size(B));
156 
157  cudaMemcpyAsync(X.memptr(), d_x, n_rhs, cudaMemcpyDeviceToHost, stream);
158 
159  cudaDeviceSynchronize();
160  }
161  else {
162  X = arma::zeros(B.n_rows, B.n_cols);
163 
164  mat full_residual = B;
165 
166  auto multiplier = norm(full_residual);
167 
168  auto counter = 0u;
169  while(counter++ < this->refinement) {
170  if(multiplier < this->tolerance) break;
171 
172  auto residual = conv_to<fmat>::from(full_residual / multiplier);
173 
174  cudaMemcpyAsync(d_b, residual.memptr(), n_rhs, cudaMemcpyHostToDevice, stream);
175 
176  for(auto I = 0llu; I < B.n_elem; I += B.n_rows) { cusolverSpScsrlsvqr(handle, int(this->n_rows), int(this->triplet_mat.n_elem), descr, (float*)d_val_idx, (int*)d_row_ptr, (int*)d_col_idx, (float*)d_b + I, float(this->tolerance), 0, (float*)d_x + I, &singularity); }
177 
178  cudaMemcpyAsync(residual.memptr(), d_x, n_rhs, cudaMemcpyDeviceToHost, stream);
179 
180  cudaDeviceSynchronize();
181 
182  const mat incre = multiplier * conv_to<mat>::from(residual);
183 
184  X += incre;
185 
186  suanpan_debug("mixed precision algorithm multiplier: %.5E\n", multiplier = arma::norm(full_residual -= this->operator*(incre)));
187  }
188  }
189 
190  if(d_b) cudaFree(d_b);
191  if(d_x) cudaFree(d_x);
192 
193  return SUANPAN_SUCCESS;
194 }
195 
196 #endif
197 
198 #endif
199 
bool factored
Definition: MetaMat.hpp:41
Precision precision
Definition: MetaMat.hpp:45
double norm(const vec &)
Definition: tensorToolbox.cpp:302
triplet_form< T, uword > triplet_mat
Definition: MetaMat.hpp:50
constexpr auto SUANPAN_SUCCESS
Definition: suanPan.h:165
unique_ptr< Material > make_copy(const shared_ptr< Material > &)
Definition: Material.cpp:359
Definition: csc_form.hpp:23
unsigned refinement
Definition: MetaMat.hpp:47
A MetaMat class that holds matrices.
Definition: MetaMat.hpp:39
A SparseMatCUDA class that holds matrices.
A SparseMat class that holds matrices.
Definition: SparseMat.hpp:34
const index_t n_elem
Definition: triplet_form.hpp:130
concept sp_d
Definition: suanPan.h:231
const uword n_rows
Definition: MetaMat.hpp:52
double tolerance
Definition: MetaMat.hpp:43