32 #ifndef SPARSEMATCUDA_HPP 33 #define SPARSEMATCUDA_HPP 37 #include <cusolverSp.h> 43 cusolverSpHandle_t handle =
nullptr;
44 cudaStream_t stream =
nullptr;
45 cusparseMatDescr_t descr =
nullptr;
47 void* d_val_idx =
nullptr;
48 void* d_col_idx =
nullptr;
49 void* d_row_ptr =
nullptr;
55 void device_dealloc()
const;
67 int solve(Mat<
T>&, const Mat<T>&) override;
71 cusolverSpCreate(&handle);
72 cudaStreamCreate(&stream);
73 cusolverSpSetStream(handle, stream);
74 cusparseCreateMatDescr(&descr);
75 cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
76 cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
77 cudaMalloc(&d_row_ptr,
sizeof(
int) * (this->n_rows + 1));
81 if(handle) cusolverSpDestroy(handle);
82 if(stream) cudaStreamDestroy(stream);
83 if(descr) cusparseDestroyMatDescr(descr);
84 if(d_row_ptr) cudaFree(d_row_ptr);
88 const size_t n_val =
sizeof(ET) * csr_mat.n_elem;
89 const size_t n_col =
sizeof(
int) * csr_mat.n_elem;
91 cudaMalloc(&d_val_idx, n_val);
92 cudaMalloc(&d_col_idx, n_col);
94 cudaMemcpyAsync(d_val_idx, csr_mat.val_mem(), n_val, cudaMemcpyHostToDevice, stream);
95 cudaMemcpyAsync(d_col_idx, csr_mat.col_mem(), n_col, cudaMemcpyHostToDevice, stream);
96 cudaMemcpyAsync(d_row_ptr, csr_mat.row_mem(),
sizeof(int) * (csr_mat.n_rows + 1), cudaMemcpyHostToDevice, stream);
98 cudaDeviceSynchronize();
102 if(d_val_idx) cudaFree(d_val_idx);
103 if(d_col_idx) cudaFree(d_col_idx);
107 :
SparseMat<T>(in_row, in_col, in_elem) { acquire(); }
129 const size_t n_rhs = (std::is_same_v<T, float> ||
Precision::MIXED == this->
precision ?
sizeof(float) :
sizeof(
double)) * B.n_elem;
134 cudaMalloc(&d_b, n_rhs);
135 cudaMalloc(&d_x, n_rhs);
139 if(std::is_same_v<T, float>) {
140 cudaMemcpyAsync(d_b, B.memptr(), n_rhs, cudaMemcpyHostToDevice, stream);
142 for(
auto I = 0llu; I < B.n_elem; I += B.n_rows) { cusolverSpScsrlsvqr(handle,
int(this->
n_rows),
int(this->
triplet_mat.
n_elem), descr, (
float*)d_val_idx, (
int*)d_row_ptr, (
int*)d_col_idx, (
float*)d_b + I,
float(this->tolerance), 0, (
float*)d_x + I, &singularity); }
144 X.set_size(arma::size(B));
146 cudaMemcpyAsync(X.memptr(), d_x, n_rhs, cudaMemcpyDeviceToHost, stream);
148 cudaDeviceSynchronize();
151 cudaMemcpyAsync(d_b, B.memptr(), n_rhs, cudaMemcpyHostToDevice, stream);
153 for(
auto I = 0llu; I < B.n_elem; I += B.n_rows) { cusolverSpDcsrlsvqr(handle,
int(this->
n_rows),
int(this->
triplet_mat.
n_elem), descr, (
double*)d_val_idx, (
int*)d_row_ptr, (
int*)d_col_idx, (
double*)d_b + I, this->tolerance, 0, (
double*)d_x + I, &singularity); }
155 X.set_size(arma::size(B));
157 cudaMemcpyAsync(X.memptr(), d_x, n_rhs, cudaMemcpyDeviceToHost, stream);
159 cudaDeviceSynchronize();
162 X = arma::zeros(B.n_rows, B.n_cols);
164 mat full_residual = B;
166 auto multiplier =
norm(full_residual);
172 auto residual = conv_to<fmat>::from(full_residual / multiplier);
174 cudaMemcpyAsync(d_b, residual.memptr(), n_rhs, cudaMemcpyHostToDevice, stream);
176 for(
auto I = 0llu; I < B.n_elem; I += B.n_rows) { cusolverSpScsrlsvqr(handle,
int(this->
n_rows),
int(this->
triplet_mat.
n_elem), descr, (
float*)d_val_idx, (
int*)d_row_ptr, (
int*)d_col_idx, (
float*)d_b + I,
float(this->tolerance), 0, (
float*)d_x + I, &singularity); }
178 cudaMemcpyAsync(residual.memptr(), d_x, n_rhs, cudaMemcpyDeviceToHost, stream);
180 cudaDeviceSynchronize();
182 const mat incre = multiplier * conv_to<mat>::from(residual);
186 suanpan_debug(
"mixed precision algorithm multiplier: %.5E\n", multiplier =
arma::norm(full_residual -= this->
operator*(incre)));
190 if(d_b) cudaFree(d_b);
191 if(d_x) cudaFree(d_x);
double norm(const vec &)
Definition: tensorToolbox.cpp:302
void suanpan_debug(const char *M,...)
Definition: print.cpp:64
constexpr auto SUANPAN_SUCCESS
Definition: suanPan.h:165
unique_ptr< Material > make_copy(const shared_ptr< Material > &)
Definition: Material.cpp:359
A SparseMatCUDA class that holds matrices.
A SparseMat class that holds matrices.
Definition: SparseMat.hpp:34
concept sp_d
Definition: suanPan.h:231