44 cusolverSpHandle_t handle =
nullptr;
45 cudaStream_t stream =
nullptr;
46 cusparseMatDescr_t descr =
nullptr;
48 cuda_ptr d_val_idx{}, d_col_idx{}, d_row_ptr{};
51 cusolverSpCreate(&handle);
52 cudaStreamCreate(&stream);
53 cusolverSpSetStream(handle, stream);
54 cusparseCreateMatDescr(&descr);
55 cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
56 cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
57 d_row_ptr =
cuda_ptr(
sizeof(
int),
static_cast<int>(this->
n_rows) + 1);
61 void release()
const {
62 if(handle) cusolverSpDestroy(handle);
63 if(stream) cudaStreamDestroy(stream);
64 if(descr) cusparseDestroyMatDescr(descr);
68 d_val_idx =
cuda_ptr(
sizeof(ET), csr_mat.n_elem);
69 d_col_idx =
cuda_ptr(
sizeof(
int), csr_mat.n_elem);
71 d_val_idx.
copy_from(csr_mat.val_mem(), stream);
72 d_col_idx.
copy_from(csr_mat.col_mem(), stream);
73 d_row_ptr.
copy_from(csr_mat.row_mem(), stream);
82 SparseMatCUDA(
const uword in_row,
const uword in_col,
const uword in_elem = 0)
83 :
SparseMat<
T>(in_row, in_col, in_elem) { init_config(); }
94 unique_ptr<MetaMat<T>>
make_copy()
override {
return std::make_unique<SparseMatCUDA>(*
this); }
98 const auto single_precision = std::is_same_v<T, float> ||
Precision::MIXED == this->setting.precision;
100 if(!this->factored) {
101 this->factored =
true;
105 const size_t unit_size = single_precision ?
sizeof(float) :
sizeof(double);
107 const cuda_ptr d_b{unit_size,
static_cast<int>(B.n_elem)}, d_x{unit_size,
static_cast<int>(B.n_elem)};
110 Col<int> code(B.n_cols);
112 if constexpr(std::is_same_v<T, float>) {
113 d_b.copy_from(B.memptr(), stream);
115 for(
auto I = 0llu, J = 0llu; I < B.n_elem; I += B.n_rows, ++J) code[J] = cusolverSpScsrlsvqr(handle,
static_cast<int>(this->n_rows), d_val_idx.size, descr, d_val_idx.get<
float>(), d_row_ptr.get(), d_col_idx.get(), d_b.get<
float>(I), std::numeric_limits<float>::epsilon(), 3, d_x.get<
float>(I), &singularity);
117 if(0 == code.max()) {
118 X.set_size(arma::size(B));
119 d_x.copy_to(X.memptr(), stream);
123 d_b.copy_from(B.memptr(), stream);
125 for(
auto I = 0llu, J = 0llu; I < B.n_elem; I += B.n_rows, ++J) code[J] = cusolverSpDcsrlsvqr(handle,
static_cast<int>(this->n_rows), d_val_idx.size, descr, d_val_idx.get<
double>(), d_row_ptr.get(), d_col_idx.get(), d_b.get<
double>(I), std::numeric_limits<double>::epsilon(), 3, d_x.get<
double>(I), &singularity);
127 if(0 == code.max()) {
128 X.set_size(arma::size(B));
129 d_x.copy_to(X.memptr(), stream);
133 X = arma::zeros(arma::size(B));
135 mat full_residual = B;
137 std::uint8_t counter{0};
138 while(counter++ < this->setting.iterative_refinement) {
139 const auto multiplier = norm(full_residual);
140 if(multiplier < this->setting.tolerance)
break;
141 suanpan_debug(
"Mixed precision algorithm multiplier: {:.5E}.\n", multiplier);
143 auto residual = conv_to<fmat>::from(full_residual / multiplier);
144 d_b.copy_from(residual.memptr(), stream);
146 for(
auto I = 0llu, J = 0llu; I < B.n_elem; I += B.n_rows, ++J) code[J] = cusolverSpScsrlsvqr(handle,
static_cast<int>(this->n_rows), d_val_idx.size, descr, d_val_idx.get<
float>(), d_row_ptr.get(), d_col_idx.get(), d_b.get<
float>(I), std::numeric_limits<float>::epsilon(), 3, d_x.get<
float>(I), &singularity);
147 if(0 != code.max())
break;
149 d_x.copy_to(residual.memptr(), stream);
150 full_residual = B - this->
operator*(X += multiplier * conv_to<mat>::from(residual));