@article {971, title = {Non-GPU-resident Dense Symmetric Indefinite Factorization}, journal = {Concurrency and Computation: Practice and Experience}, year = {2016}, month = {2016-11}, abstract = {We study various algorithms to factorize a symmetric indefinite matrix that does not fit in the core memory of a computer. There are two sources of the data movement into the memory: one needed for selecting and applying pivots and the other needed to update each column of the matrix for the factorization. It is a challenge to obtain high performance of such an algorithm when the pivoting is required to ensure the numerical stability of the factorization. For example, when factorizing each column of the matrix, a diagonal entry, which ensures the stability, may need to be selected as a pivot among the remaining diagonals, and moved to the leading diagonal by swapping both the corresponding rows and columns of the matrix. If the pivot is not in the core memory, then it must be loaded into the core memory. For updating the matrix, the data locality may be improved by partitioning the matrix. For example, a right-looking partitioned algorithm first factorizes the leading columns, called panel, and then uses the factorized panel to update the trailing submatrix. This algorithm only accesses the trailing submatrix after each panel factorization (instead of after each column factorization) and performs most of its floating-point operations (flops) using BLAS-3, which can take advantage of the memory hierarchy. However, because the pivots cannot be predetermined, the whole trailing submatrix must be updated before the next panel factorization can start. When the whole submatrix does not fit in the core memory all at once, loading the block columns into the memory can become the performance bottleneck. Similarly, the left-looking variant of the algorithm would require to update each panel with all of the previously factorized columns. This makes it a much greater challenge to implement an efficient out-of-core symmetric indefinite factorization compared with an out-of-core nonsymmetric LU factorization with partial pivoting, which only requires to swap the rows of the matrix and accesses the trailing submatrix after each in-core factorization (instead of after each panel factorization by the symmetric factorization). To reduce the amount of the data transfer, in this paper we uses the recently proposed left-looking communication-avoiding variant of the symmetric factorization algorithm to factorize the columns in the core memory, and then perform the partitioned right-looking out-of-core trailing submatrix updates. This combination may still require to load the pivots into the core memory, but it only updates the trailing submatrix after each in-core factorization, while the previous algorithm updates it after each panel factorization.Although these in-core and out-of-core algorithms can be applied at any level of the memory hierarchy, we apply our designs to the GPU and CPU memory, respectively. We call this specific implementation of the algorithm a non{\textendash}GPU-resident implementation. Our performance results on the current hybrid CPU/GPU architecture demonstrate that when the matrix is much larger than the GPU memory, the proposed algorithm can obtain significant speedups over the communication-hiding implementations of the previous algorithms.}, doi = {10.1002/cpe.4012}, author = {Ichitaro Yamazaki and Stanimire Tomov and Jack Dongarra} }