@inproceedings{835,
  title     = {Scaling Up Matrix Computations on Shared-Memory Manycore Systems with 1000 CPU Cores},
  booktitle = {International Conference on Supercomputing},
  year      = {2014},
  month     = jun,
  pages     = {333-342},
  publisher = {ACM},
  address   = {Munich, Germany},
  abstract  = {While the growing number of cores per chip allows researchers to solve larger scientific and engineering problems, the parallel efficiency of the deployed parallel software starts to decrease. This unscalability problem affects both vendor-provided and open-source software and wastes CPU cycles and energy. Expecting CPUs with hundreds of cores to be imminent, we have designed a new framework to perform matrix computations on massively manycore systems. Our performance analysis on manycore systems shows that the unscalability bottleneck is related to Non-Uniform Memory Access (NUMA): memory bus contention and remote memory access latency. To overcome the bottleneck, we have designed NUMA-aware tile algorithms, with the help of a dynamic scheduling runtime system, to minimize NUMA memory accesses. The main idea is to identify the data that is either read a number of times or written once by a thread resident on a remote NUMA node, and then use the runtime system to conduct data caching and movement between different NUMA nodes. Based on experiments with QR factorizations, we demonstrate that our framework achieves great scalability on a 48-core AMD Opteron system (e.g., parallel efficiency drops only 3\% from one core to 48 cores). We also deploy our framework on an extreme-scale shared-memory SGI machine that has 1024 CPU cores and runs a single Linux operating system image. Our framework continues to scale well and can outperform the vendor-optimized Intel MKL library by up to 750\%.},
  isbn      = {978-1-4503-2642-1},
  doi       = {10.1145/2597652.2597670},
  author    = {Fengguang Song and Jack Dongarra}
}