Index: Make.atlas
===================================================================
RCS file: Make.atlas
diff -N Make.atlas
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ Make.atlas	20 Aug 2008 03:57:53 -0000	1.2
@@ -0,0 +1,66 @@
+##################################################################
+# (C) Copyright IBM Corporation 2008
+#
+##################################################################
+
+# Platform
+
+ARCH := atlas
+
+# Tools
+
+SHELL := /bin/sh
+CD := cd
+CP := cp
+LN_S := ln -s
+MKDIR := mkdir
+TOUCH := touch
+
+CC := mpicc
+LINKER := mpicc
+ARCHIVER := /usr/bin/ar
+RANLIB := echo
+
+# Directories
+
+INCdir := $(TOPdir)/include
+BINdir := $(TOPdir)/bin/$(ARCH)
+
+# HPL library
+
+HPLlib := $(TOPdir)/lib/$(ARCH)/libhpl.a
+
+# MPI package
+
+MPdir :=
+MPinc :=
+MPlib :=
+
+# Linear Algebra Library package -- Atlas
+
+LAdir := /usr/local/atlas
+LAinc := -I$(LAdir)/include
+LAlib := -L$(LAdir)/lib -lf77blas -latlas -lgfortran
+
+# F2C options
+
+F2CDEFS := -DAdd__ -DF77_INTEGER=int -DStringSunStyle
+
+# HPL options
+
+HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc)
+HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS)
+HPL_DEFS += -DHPL_USE_HUGE_PAGES=1
+
+ifdef TIMING
+HPL_DEFS += -DHPL_DETAILED_TIMING
+endif
+
+HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib)
+
+CCNOOPT := -m64 -Wall $(HPL_DEFS)
+CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops
+#CCFLAGS := $(CCNOOPT) -O0 -ggdb -g3
+LINKFLAGS := $(CCFLAGS)
+ARFLAGS := -r

Index: Make.qs22
===================================================================
RCS file: Make.qs22
diff -N Make.qs22
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ Make.qs22	20 Aug 2008 03:57:53 -0000	1.7
@@ -0,0 +1,74 @@
+##################################################################
+# (C) Copyright IBM Corporation 2008
+#
+##################################################################
+
+# Platform
+
+ARCH := qs22
+
+# Tools
+
+SHELL := /bin/sh
+CD := cd
+CP := cp
+LN_S := ln -s
+MKDIR := mkdir
+TOUCH := touch
+
+CC := mpicc
+LINKER := mpicc
+ARCHIVER := /usr/bin/ar
+RANLIB := echo
+
+# Directories
+
+INCdir := $(TOPdir)/include
+BINdir := $(TOPdir)/bin/$(ARCH)
+
+# HPL library
+
+HPLlib := $(TOPdir)/lib/$(ARCH)/libhpl.a
+ACLlib := $(TOPdir)/accel/lib/libhpl_accel_ppu.a
+
+# MPI package
+
+MPdir :=
+MPinc :=
+MPlib :=
+
+# Linear Algebra Library package -- Atlas
+
+LAdir := /usr/local/atlas
+LAinc := -I$(LAdir)/include
+LAlib := -L$(LAdir)/lib -lf77blas -latlas -lgfortran
+
+# Cell SDK
+
+CSdir := /opt/cell/sdk/prototype
+CSinc := -I$(CSdir)/usr/include
+CSlib := -L$(CSdir)/usr/lib64 -lstdc++ -lpthread -lrt -lspe2 -lnuma
+
+# F2C options
+
+F2CDEFS := -DAdd__ -DF77_INTEGER=int -DStringSunStyle
+
+# HPL options
+
+HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc)
+HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS)
+HPL_DEFS += -DHPL_USE_HUGE_PAGES=1
+
+ifdef TIMING
+HPL_DEFS += -DHPL_DETAILED_TIMING
+endif
+
+HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib) $(ACLlib)
+
+CCNOOPT := -m64 -Wall $(HPL_DEFS)
+CCNOOPT += -DHPL_CALL_ACCEL
+CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops
+#CCFLAGS := $(CCNOOPT) -O0 -ggdb3
+LINKFLAGS := $(CCFLAGS)
+ARFLAGS := -r
Index: Make.qs22_sdkblas
===================================================================
RCS file: Make.qs22_sdkblas
diff -N Make.qs22_sdkblas
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ Make.qs22_sdkblas	7 Aug 2008 13:07:08 -0000	1.4
@@ -0,0 +1,78 @@
+##################################################################
+# Licensed Materials - Property of IBM.
+# (C) Copyright IBM Corporation 2007
+# All Rights Reserved.
+#
+# US Government Users Restricted Rights -
+# Use, duplication or disclosure restricted by
+# GSA ADP Schedule Contract with IBM Corporation.
+
+##################################################################
+
+# Platform
+
+ARCH := qs22_sdkblas
+
+# Tools
+
+SHELL := /bin/sh
+CD := cd
+CP := cp
+LN_S := ln -s
+MKDIR := mkdir
+TOUCH := touch
+
+CC := mpicc
+LINKER := mpicc
+ARCHIVER := /usr/bin/ar
+RANLIB := echo
+
+# Directories
+
+INCdir := $(TOPdir)/include
+BINdir := $(TOPdir)/bin/$(ARCH)
+
+# HPL library
+
+HPLlib := $(TOPdir)/lib/$(ARCH)/libhpl.a
+
+# MPI package
+
+MPdir :=
+MPinc :=
+MPlib :=
+
+# Linear Algebra Library package
+
+LAdir := /usr
+LAinc := -I$(LAdir)/include
+LAlib := -L$(LAdir)/lib64 -lblas
+
+# Cell SDK
+
+CSdir := /opt/cell/sdk/prototype
+CSinc := -I$(CSdir)/usr/include
+CSlib := -L$(CSdir)/usr/lib64 -lstdc++ -lpthread -lrt -lspe2 -lnuma
+
+# F2C options
+
+F2CDEFS := -DAdd__ -DF77_INTEGER=int -DStringSunStyle
+
+# HPL options
+
+HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc)
+HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS)
+HPL_DEFS += -DHPL_USE_HUGE_PAGES=1
+
+ifdef TIMING
+HPL_DEFS += -DHPL_DETAILED_TIMING
+endif
+
+HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib)
+
+CCNOOPT := -m64 -Wall $(HPL_DEFS)
+CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops
+#CCFLAGS := $(CCNOOPT) -O0 -ggdb -g3
+LINKFLAGS := $(CCFLAGS)
+ARFLAGS := -r

Index: Make.top
===================================================================
RCS file: /cvsroot/hpl_qs22/Make.top,v
retrieving revision 1.1
retrieving revision 1.4
diff -u -r1.1 -r1.4
--- Make.top	10 Feb 2008 21:45:50 -0000	1.1
+++ Make.top	26 Aug 2008 13:24:26 -0000	1.4
@@ -43,6 +43,8 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # ######################################################################
+# Modifications (C) Copyright IBM Corporation 2008
+# ######################################################################
 #
 arch = UNKNOWN
 #
@@ -51,6 +53,7 @@
 ## build ###############################################################
 #
 build_src :
+	( $(CD) src/accel/$(arch); $(MAKE) )
 	( $(CD) src/auxil/$(arch); $(MAKE) )
 	( $(CD) src/blas/$(arch); $(MAKE) )
 	( $(CD) src/comm/$(arch); $(MAKE) )
@@ -78,6 +81,7 @@
 	- $(MKDIR) bin/$(arch)
 #
 startup_src :
+	- $(MAKE) -f Make.top leaf le=src/accel arch=$(arch)
 	- $(MAKE) -f Make.top leaf le=src/auxil arch=$(arch)
 	- $(MAKE) -f Make.top leaf le=src/blas arch=$(arch)
 	- $(MAKE) -f Make.top leaf le=src/comm arch=$(arch)
@@ -98,6 +102,7 @@
 ## refresh #############################################################
 #
 refresh_src :
+	- $(CP) makes/Make.accel src/accel/$(arch)/Makefile
 	- $(CP) makes/Make.auxil src/auxil/$(arch)/Makefile
 	- $(CP) makes/Make.blas src/blas/$(arch)/Makefile
 	- $(CP) makes/Make.comm src/comm/$(arch)/Makefile
@@ -118,6 +123,7 @@
 ## clean ###############################################################
 #
 clean_src :
+	- ( $(CD) src/accel/$(arch); $(MAKE) clean )
 	- ( $(CD) src/auxil/$(arch); $(MAKE) clean )
 	- ( $(CD) src/blas/$(arch); $(MAKE) clean )
 	- ( $(CD) src/comm/$(arch); $(MAKE) clean )
@@ -138,6 +144,7 @@
 ## clean_arch ##########################################################
 #
 clean_arch_src :
+	- $(RM) -r src/accel/$(arch)
 	- $(RM) -r src/auxil/$(arch)
 	- $(RM) -r src/blas/$(arch)
 	- $(RM) -r src/comm/$(arch)
@@ -165,6 +172,7 @@
 ## clean_guard #########################################################
 #
 clean_guard_src :
+	- ( $(CD) src/accel/$(arch); $(RM) *.grd )
 	- ( $(CD) src/auxil/$(arch); $(RM) *.grd )
 	- ( $(CD) src/blas/$(arch); $(RM) *.grd )
 	- ( $(CD) src/comm/$(arch); $(RM) *.grd )

Index: Makefile
===================================================================
RCS file: /cvsroot/hpl_qs22/Makefile,v
retrieving revision 1.1
retrieving revision 1.4
diff -u -r1.1 -r1.4
--- Makefile	10 Feb 2008 21:45:50 -0000	1.1
+++ Makefile	26 Aug 2008 13:24:26 -0000	1.4
@@ -43,12 +43,16 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # ######################################################################
+# Modifications (C) Copyright IBM Corporation 2008
+# ######################################################################
 #
 #
 SHELL = /bin/sh
 #
 arch = UNKNOWN
 #
+export TOPdir = $(shell pwd)
+#
 ## Targets #############################################################
 #
 all : install
@@ -70,10 +74,12 @@
 #
 build :
 	$(MAKE) -f Make.top build_src arch=$(arch)
+	$(MAKE) -C accel arch=$(arch)
 	$(MAKE) -f Make.top build_tst arch=$(arch)
 #
 clean :
 	$(MAKE) -f Make.top clean_src arch=$(arch)
+	$(MAKE) -C accel clean arch=$(arch)
 	$(MAKE) -f Make.top clean_tst arch=$(arch)
 #
 clean_arch :

Index: accel/Makefile
===================================================================
RCS file: accel/Makefile
diff -N accel/Makefile
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/Makefile	20 Aug 2008 03:57:53 -0000	1.3
@@ -0,0 +1,25 @@
+# ---------------------------------------------------------------
+# (C) Copyright IBM Corporation 2007,2008
+#
+# All Rights Reserved.
+# ---------------------------------------------------------------
+
+ifeq ($(arch),qs22)
+
+########################################################################
+# Target
+########################################################################
+
+DIRS = lib
+
+########################################################################
+# make.footer
+########################################################################
+
+include $(CELL_TOP)/buildutils/make.footer
+
+else
+
+all clean :
+
+endif

Index: accel/lib/Makefile
===================================================================
RCS file: accel/lib/Makefile
diff -N accel/lib/Makefile
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/Makefile	20 Aug 2008 03:57:53 -0000	1.5
@@ -0,0 +1,39 @@
+# ---------------------------------------------------------------
+# (C) Copyright IBM Corporation 2007,2008
+#
+# ---------------------------------------------------------------
+
+########################################################################
+# Subdirectories
+########################################################################
+
+DIRS = spu
+
+########################################################################
+# Target
+########################################################################
+
+TARGET_PROCESSOR = ppu64
+LIBRARY = libhpl_accel_ppu.a
+
+#CC_OPT_LEVEL = -g
+
+CPPFLAGS = -DNDEBUG
+#CPPFLAGS += -DACCEL_LITTLE_ENDIAN
+CPPFLAGS += -DVALIDATE_4GB_CROSSING
+CPPFLAGS += -DMATRIX_4GB_CROSSING
+#CPPFLAGS += -DPANEL_4GB_CROSSING
+
+########################################################################
+# Local Defines
+########################################################################
+
+SYS_LIBS += -lspe2 -lpthread -lm
+
+IMPORTS = spu/hpl_accel_spu-embed64.o
+
+########################################################################
+# make.footer
+########################################################################
+
+include $(CELL_TOP)/buildutils/make.footer

Index: accel/lib/hpl_accel.h
===================================================================
RCS file: accel/lib/hpl_accel.h
diff -N accel/lib/hpl_accel.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel.h	20 Aug 2008 03:57:53 -0000	1.13
@@ -0,0 +1,758 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#ifndef _HPL_ACCEL_H_
+#define _HPL_ACCEL_H_
+
+#define M_SUB (64)    /* Size of sub-blocks - M_SUB x M_SUB */
+
+/* ---------------------------------------------------------------- */
+/* Inline functions for addressing matrix storage of various formats*/
+/* ---------------------------------------------------------------- */
+
+/* The following inline functions compute an array index for each
+ * of the supported formats - column ordered, row ordered, and blocked
+ * (column-ordered blocks whose contents are row ordered).
+ * The inputs are the row (row), the column (col), and the leading
+ * dimension (ld).
+ */
+
+/* ld is the number of elements from column n to column n+1
+ */
+static inline unsigned int INDEX_COL(unsigned int row, unsigned int col, unsigned int ld) {
+  return (col*ld + row);
+}
+
+/* ld is the number of elements from row n to row n+1
+ */
+static inline unsigned int INDEX_ROW(unsigned int row, unsigned int col, unsigned int ld) {
+  return (row*ld + col);
+}
+
+/* ld is the number of elements from block column n to block column n+1.
+ * This can also be described as the number of elements between column
+ * n and column n+M_SUB.
+ */
+static inline unsigned int INDEX_BLK(unsigned int row, unsigned int col, unsigned int ld) {
+  return ((col / M_SUB)*ld + INDEX_ROW( row, (col % M_SUB), M_SUB ));
+}
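/* Worked example of the blocked indexing above (an editorial sketch; the
 * helper name and the comparison harness are illustrative assumptions, not
 * part of this header). For a 128x128 matrix in blocked form, each column
 * of 64x64 blocks holds 128*64 doubles, so ld = 8192 and, for example,
 *
 *    INDEX_BLK(70, 65, 8192) = (65/64)*8192 + 70*64 + (65%64) = 12673
 *
 * A check that a blocked copy agrees with a column-ordered original,
 * ignoring the endian conversion for clarity:
 */
static inline int example_index_blk_agrees(const double *a_col, unsigned int ld_col,
                                           const double *a_blk, unsigned int ld_blk,
                                           unsigned int row, unsigned int col)
{
  return a_col[INDEX_COL(row, col, ld_col)] == a_blk[INDEX_BLK(row, col, ld_blk)];
}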
+
+
+/* NOTE 1:
+ *
+ * The following defines can be used to configure the code for handling
+ * 4GB crossings. They include:
+ *
+ * MATRIX_4GB_CROSSING   If defined, then all block ordered matrices can cross a 4GB
+ *                       address boundary. However, the crossing can only occur on a
+ *                       block boundary, never within a matrix block. In addition,
+ *                       the block leading dimension must be no larger than 2^28 - 1.
+ *                       If not defined, then a matrix cannot cross a 4GB
+ *                       address boundary.
+ *
+ * PANEL_4GB_CROSSING    If defined, then all row or column order panels (this includes
+ *                       U panels, L panels, and row buffers) may cross at most one 4GB
+ *                       address boundary, but only on a row/column boundary. In addition,
+ *                       the leading dimension must not exceed 2^28 - 1.
+ *                       If not defined, then a panel cannot cross a 4GB address boundary.
+ *
+ * VALIDATE_4GB_CROSSING If defined, then include code to validate the specified
+ *                       boundary constraints. This define is intended for debug
+ *                       purposes only.
+ */
+
+#ifdef __PPU__
+
+#include <ppu_intrinsics.h>
+
+/* hpl_accel_byte_swap
+ * -------------------
+ * Convert a double from little-endian format to big-endian format. This
+ * function is not optimal. Recommend using hpl_accel_byte_swap_load and
+ * hpl_accel_byte_swap_store instead.
+ */
+static inline double hpl_accel_byte_swap(double d) {
+#ifdef ACCEL_LITTLE_ENDIAN
+  union {
+    unsigned long long ull;
+    double d;
+  } in, out;
+
+  in.d = d;
+  out.ull = __ldbrx(&in.ull);
+  return (out.d);
+#else
+  return (d);
+#endif
+}
+
+
+/* hpl_accel_byte_swap_load
+ * ------------------------
+ * Load a little-endian byte ordered, double word value.
+ */
+static inline double hpl_accel_byte_swap_load(unsigned long long *ptr)
+{
+#ifdef ACCEL_LITTLE_ENDIAN
+  union {
+    unsigned long long ull;
+    double d;
+  } x;
+
+  x.ull = __ldbrx(ptr);
+  return (x.d);
+#else
+  return (*((double *)ptr));
+#endif
+}
+
+
+/* hpl_accel_byte_swap_store
+ * -------------------------
+ * Store a double word value in little-endian byte ordering.
+ */
+static inline void hpl_accel_byte_swap_store(unsigned long long *ptr, double d)
+{
+#ifdef ACCEL_LITTLE_ENDIAN
+  union {
+    unsigned long long ull;
+    double d;
+  } x;
+
+  x.d = d;
+  __stdbrx(ptr, x.ull);
+#else
+  *((double *)ptr) = d;
+#endif
+}
+
+
+/* hpl_accel_init
+ * --------------
+ * Initialize the HPL accelerator. If the accelerator is successfully
+ * initialized, then HPL_ACCEL_INIT_SUCCESS is returned, otherwise
+ * HPL_ACCEL_INIT_FAIL is returned.
+ */
+
+#define HPL_ACCEL_INIT_SUCCESS  0
+#define HPL_ACCEL_INIT_FAIL    -1
+
+extern int hpl_accel_init();
+
+/* hpl_accel_fini
+ * --------------
+ * Finalize the HPL accelerator. If the accelerator successfully
+ * finishes, then HPL_ACCEL_FINI_SUCCESS is returned, otherwise
+ * HPL_ACCEL_FINI_FAIL is returned.
+ */
+#define HPL_ACCEL_FINI_SUCCESS  0
+#define HPL_ACCEL_FINI_FAIL    -1
+
+extern int hpl_accel_fini();
+
+
+/* hpl_accel_dgemm_CL_R_B_CL
+ * hpl_accel_dgemm_CL_B_B_CL
+ * -------------------------
+ * Specialized accelerated DGEMM. The DGEMM computes:
+ *
+ *    [c] -= [a]*[b]
+ *
+ * If a panel is specified, then the output is placed in [panel]:
+ *
+ *    [panel] = [c] - [a]*[b];
+ *
+ * m       Number of rows in [a], [c], and [panel].
+ * n       Number of cols in [b], [c], and [panel].
+ * k       Number of cols in [a] and rows in [b].
+ * a       Column-ordered, little-endian matrix of m rows and k columns.
+ * lda     Leading dimension of matrix [a].
+ * b       Big-endian matrix of k rows and n columns. This is either row ordered,
+ *         in the case of hpl_accel_dgemm_CL_R_B_CL, or block formatted, in the
+ *         case of hpl_accel_dgemm_CL_B_B_CL.
+ * ldb     Leading dimension of matrix [b]. For a block formatted [b] matrix,
+ *         this is the number of doubles to advance b from block column n to
+ *         column n+1.
+ * c       Block-formatted, big-endian matrix of m rows and n columns.
+ *         The blocks are 64x64 and are stored in column order; the contents
+ *         of each block are row-ordered.
+ * ldc     Leading block dimension of matrix [c]. The number of doubles to
+ *         advance c from block column n to column n+1.
+ * blk_row Starting block matrix row offset. This offset is applied only to the
+ *         [c] matrix.
+ * blk_col Starting block matrix column offset. This offset is applied to the [c]
+ *         matrix and the [b] matrix when it is block formatted (i.e. for
+ *         hpl_accel_dgemm_CL_B_B_CL).
+ * panel   Column-ordered, little-endian DGEMM result matrix of m rows and n columns.
+ *         If NULL, the result is returned in [c].
+ * ldp     Leading dimension of [panel]. If [panel] is NULL, this must be 0.
+ * incomplete Pointer to a system variable that is first initialized to non-zero
+ *         and asynchronously cleared when the requested operation has completed.
+ *         If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ * a       Buffer may not straddle 4GB boundary (See Note 1).
+ * b       Buffer may not straddle 4GB boundary (See Note 1).
+ * c       Buffer may not straddle 4GB boundary (See Note 1).
+ * panel   Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ * k       Must be 128 to be accelerated.
+ * m       Optimal if a multiple of 64. Integral multiples of 64 may be accelerated.
+ * n       Optimal if a multiple of 64. Integral multiples of 64 may be accelerated.
+ * a       Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ * b       Optimal if cacheline aligned. Accelerated if [b] is quadword aligned.
+ * c       Optimal if cacheline aligned. Accelerated if [c] is quadword aligned.
+ * panel   Optimal if cacheline aligned. Accelerated if [panel] is quadword aligned.
+ * lda     Optimal if a multiple of 16. Accelerated if lda is even.
+ * ldb     Optimal if a multiple of 16. Accelerated if ldb is even.
+ * ldc     Optimal if a multiple of 16. Accelerated if ldc is even.
+ * ldp     Optimal if a multiple of 16. Accelerated if ldp is even.
+ * blk_row Must be a multiple of M_SUB in order to be SPE accelerated.
+ * blk_col Must be a multiple of M_SUB in order to be SPE accelerated.
+ */
+
+extern void hpl_accel_dgemm_CL_R_B_CL(int m, int n, int k,
+                                      const double *a, int lda,
+                                      const double *b, int ldb,
+                                      double *c, int ldc,
+                                      unsigned int blk_row, unsigned int blk_col,
+                                      double *panel, int ldp,
+                                      unsigned long long *incomplete);
+
+extern void hpl_accel_dgemm_CL_B_B_CL(int m, int n, int k,
+                                      const double *a, int lda,
+                                      const double *b, int ldb,
+                                      double *c, int ldc,
+                                      unsigned int blk_row, unsigned int blk_col,
+                                      double *panel, int ldp,
+                                      unsigned long long *incomplete);
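/* Usage sketch for hpl_accel_dgemm_CL_R_B_CL (editorial; the wrapper, its
 * argument names, and the spin-wait are illustrative assumptions based on
 * the restrictions above, not code from this patch). k must be 128 for SPE
 * acceleration, buffers quadword aligned, leading dimensions even:
 */
static void example_dgemm_update(const double *a, int lda,  /* L panel, column-ordered LE */
                                 const double *b, int ldb,  /* U panel, row-ordered BE    */
                                 double *c, int ldc,        /* trailing matrix, blocked   */
                                 int m, int n)
{
  volatile unsigned long long incomplete = 1;

  hpl_accel_dgemm_CL_R_B_CL(m, n, 128, a, lda, b, ldb, c, ldc,
                            0, 0,       /* blk_row, blk_col            */
                            NULL, 0,    /* no panel output; update [c] */
                            (unsigned long long *)&incomplete);

  while (incomplete)
    ;                                   /* cleared by the SPEs when done */
}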
+
+
+/* hpl_accel_dgemm_C_C_C
+ * -------------------------
+ * Specialized accelerated DGEMM. The DGEMM computes:
+ *
+ *    [c] -= [a]*[b]
+ *
+ * m       Number of rows in [a] and [c].
+ * n       Number of cols in [b] and [c].
+ * k       Number of cols in [a] and rows in [b].
+ * a       Column-ordered, big-endian matrix of m rows and k columns.
+ * lda     Leading dimension of matrix [a].
+ * b       Column-ordered, big-endian matrix of k rows and n columns.
+ * ldb     Leading dimension of matrix [b].
+ * c       Column-ordered, big-endian matrix of m rows and n columns.
+ * ldc     Leading dimension of matrix [c].
+ * incomplete Pointer to a system variable that is first initialized to non-zero
+ *         and asynchronously cleared when the requested operation has completed.
+ *         If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ * a       Buffer may not straddle 4GB boundary (See Note 1).
+ * c       Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ * k       Must be a multiple of 4 and no larger than 64 to be accelerated.
+ * m       Optimal if a multiple of 16. Integral multiples of 8 may be accelerated.
+ * n       Optimal if a multiple of 4.
+ * a       Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ * b       Optimal if cacheline aligned. Accelerated if [b] is quadword aligned.
+ * c       Optimal if cacheline aligned. Accelerated if [c] is quadword aligned.
+ * lda     Optimal if a multiple of 16. Accelerated if lda is even.
+ * ldb     Optimal if a multiple of 16. Accelerated if ldb is even.
+ * ldc     Optimal if a multiple of 16. Accelerated if ldc is even.
+ */
+
+extern void hpl_accel_dgemm_C_C_C(int m, int n, int k,
+                                  const double *a, int lda,
+                                  const double *b, int ldb,
+                                  double *c, int ldc,
+                                  unsigned long long *incomplete);
+
+/* hpl_accel_dtrsm_CL_R_B
+ * ----------------------
+ * Specialized accelerated DTRSM. The DTRSM solves for [x] the matrix equation
+ *
+ *    [a]*[x] = [b]
+ *
+ * where [a] is a unit lower-triangular matrix. The solution is returned in [b]
+ * unless [c] is non-NULL, in which case the solution is returned in [c].
+ *
+ * m       Number of rows in [b], number of columns in [a].
+ * n       Number of columns in [b].
+ * a       Column-ordered, little-endian, unit lower-triangular matrix of
+ *         dimension lda rows by m columns.
+ * lda     Leading dimension of matrix [a].
+ * b       Row-ordered, big-endian matrix of m rows and n columns. On entry
+ *         contains the right-hand side matrix and is overwritten by the
+ *         solution matrix [x].
+ * ldb     Leading dimension of matrix [b].
+ * c       Block-formatted, big-endian matrix. The blocks are 64x64 and are
+ *         stored in column order; the contents of each block are row-ordered.
+ *         If non-NULL, the solution is returned in the row of blocks in [c]
+ *         instead of [b]. This must point to the start of a matrix block.
+ * ldc     Leading block dimension of matrix [c]. The number of doubles to
+ *         advance c from block column to the next block column. If [c]
+ *         is NULL, then ldc should also be 0.
+ * blk_row Starting [c] block matrix row offset. If [c] is NULL, then blk_row
+ *         must also be 0.
+ * blk_col Starting [c] block matrix column offset. If [c] is NULL, then blk_col
+ *         must also be 0.
+ * incomplete Pointer to a system variable that is first initialized to non-zero
+ *         and asynchronously cleared when the requested operation has completed.
+ *         If NULL, no notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ * a       Buffer may not straddle 4GB boundary (See Note 1).
+ * b       Buffer may not straddle 4GB boundary (See Note 1).
+ * c       Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ * m       Must be 128 to be accelerated.
+ * n       Optimal if a multiple of 16. Integral multiples of 16 may be accelerated.
+ * a       Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ * b       Optimal if cacheline aligned. Accelerated if [b] is quadword aligned.
+ * c       Optimal if cacheline aligned. Accelerated if [c] is quadword aligned.
+ * lda     Optimal if a multiple of 16. Accelerated if lda is even.
+ * ldb     Optimal if a multiple of 16. Accelerated if ldb is even. Memory throughput
+ *         is maximized if ldb is NOT an integral multiple of 256.
+ * ldc     Optimal if a multiple of 16. Accelerated if ldc is even.
+ * blk_col Must be a multiple of 16 in order to be SPE accelerated. This is a current
+ *         implementation restriction.
+ */
+
+extern void hpl_accel_dtrsm_CL_R_B(int m, int n,
+                                   const double *a, int lda,
+                                   double *b, int ldb,
+                                   double *c, int ldc,
+                                   unsigned int blk_row, unsigned int blk_col,
+                                   unsigned long long *incomplete);
+
+
+/* hpl_accel_dtrsm_CL_B
+ * --------------------
+ * Specialized accelerated DTRSM. The DTRSM solves for [x] the matrix equation
+ *
+ *    [a]*[x] = [b]
+ *
+ * where [a] is a unit lower-triangular matrix. The solution is returned in [b].
+ *
+ * m       Number of rows in [b], number of columns in [a].
+ * n       Number of columns in [b].
+ * a       Column-ordered, little-endian, unit lower-triangular matrix of
+ *         dimension lda rows by m columns.
+ * lda     Leading dimension of matrix [a].
+ * b       Block-formatted, big-endian matrix of m rows and n columns.
+ *         The blocks are 64x64 and are stored in column order; the contents
+ *         of each block are row-ordered. On entry contains the right-hand
+ *         side matrix and is overwritten by the solution matrix [x]. This
+ *         must point to the start of a matrix block.
+ * ldb     Leading dimension of matrix [b]. The number of doubles to
+ *         advance b from block column to the next block column.
+ * blk_row Starting [b] block matrix row offset.
+ * blk_col Starting [b] block matrix column offset.
+ * incomplete Pointer to a system variable that is first initialized to non-zero
+ *         and asynchronously cleared when the requested operation has completed.
+ *         If NULL, no notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ * a       Buffer may not straddle 4GB boundary (See Note 1).
+ * b       Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ * m       Must be 128 to be accelerated.
+ * n       Optimal if a multiple of 16. Integral multiples of 16 may be accelerated.
+ * a       Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ * b       Optimal if cacheline aligned. Accelerated if [b] is quadword aligned.
+ * lda     Optimal if a multiple of 16. Accelerated if lda is even.
+ * ldb     Optimal if a multiple of 16. Accelerated if ldb is even.
+ * blk_col Must be a multiple of 16 in order to be SPE accelerated. This is a current
+ *         implementation restriction.
+ */
+
+extern void hpl_accel_dtrsm_CL_B(int m, int n,
+                                 const double *a, int lda,
+                                 double *b, int ldb,
+                                 unsigned int blk_row, unsigned int blk_col,
+                                 unsigned long long *incomplete);
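/* Usage sketch for the accelerated panel solve (editorial; the wrapper and
 * the spin-wait are illustrative assumptions). m must equal 128 and n should
 * be a multiple of 16 for full SPE acceleration; the solution overwrites the
 * row-ordered right-hand sides in [b]:
 */
static void example_dtrsm_solve(const double *a, int lda,  /* unit lower triangle, CL */
                                double *b, int ldb,        /* RHS rows, row-ordered   */
                                int n)
{
  volatile unsigned long long incomplete = 1;

  hpl_accel_dtrsm_CL_R_B(128, n, a, lda, b, ldb,
                         NULL, 0,   /* c, ldc: solve in place in [b] */
                         0, 0,      /* blk_row, blk_col              */
                         (unsigned long long *)&incomplete);
  while (incomplete)
    ;
}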
+
+
+
+/* hpl_accel_reform_panel_CL_to_B
+ * -------------------------------
+ * Copy and reformat the L panel from the panel buffer pointed to by panel into matrix [a].
+ * The input L panel is assumed to be column-ordered, little-endian with a leading dimension
+ * of ldp. The matrix is assumed to be constructed of 64x64 element, row-ordered, big-endian
+ * blocks. The blocks are assumed to be column ordered.
+ *
+ * m       Number of rows of panel to copy to [a].
+ * n       Number of columns of panel to copy to [a].
+ * a       Block formatted matrix. a points to the location within [a] to receive
+ *         the data being copied and reformatted from panel.
+ * lda     Leading dimension of matrix [a]. This contains the number of doubles to
+ *         advance a from block column n to column n+1.
+ * panel   Pointer to the L panel containing the data to be reformatted and copied to
+ *         matrix [a]. The [panel] is column-ordered, little-endian.
+ * ldp     Leading dimension of the panel. This is the number of doubles between
+ *         column n and column n+1.
+ * incomplete Pointer to a system variable that is first initialized to non-zero
+ *         and asynchronously cleared when the requested operation has completed.
+ *         If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ *
+ */
+
+void hpl_ref_reform_panel_CL_to_B(int m, int n,
+                                  double *a, int lda,
+                                  double *panel, int ldp,
+                                  unsigned long long *incomplete);
+
+
+/* hpl_accel_reform_matrix_CL_to_B
+ * -------------------------------
+ * In-place reformat of the matrix [a] from column-ordered, little-endian to blocked,
+ * big-endian format. The blocked format is 64x64, row-ordered blocks with the blocks
+ * being column ordered. The pad between the columns of blocks is zero-filled.
+ *
+ * m       Number of rows in [a]. If m is not a multiple of 64, then the additional
+ *         rows needed to pad [a] to a multiple of 64 rows are zeroed.
+ * n       Number of cols in [a].
+ * a       Column-ordered, little-endian matrix of m rows and n columns.
+ * lda     Leading dimension of matrix [a].
+ * scratch Scratch buffer used to assist the reformatting of [a]. The scratch buffer
+ *         must be at least 64*roundup(m,64) elements.
+ * size    The size (number of elements) of the scratch buffer. The scratch buffer
+ *         must be at least approximately 64*m elements. In general, better performance
+ *         is achieved if the scratch buffer is larger and more SPEs can be deployed
+ *         to the problem.
+ * incomplete Pointer to a system variable that is first initialized to non-zero
+ *         and asynchronously cleared when the requested operation has completed.
+ *         If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ * n       Must be an integral multiple of 64.
+ * a       Must be quadword aligned and buffer may not straddle 4GB boundary (See Note 1).
+ * lda     Must be even and at least roundup(m,64).
+ * scratch Must be quadword aligned and must not straddle 4GB boundary.
+ * size    Must be at least 64*roundup(m,64).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ * a       Must be cacheline aligned.
+ * lda     Must be a multiple of 16.
+ * scratch Must be cacheline aligned.
+ * size    Must be at least 4*64*m for optimal performance.
+ *
+ * Note: For 4GB crossing support, the matrix a is considered to be a block "matrix".
+ */
+extern void hpl_accel_reform_matrix_CL_to_B(int m, int n,
+                                            double *a, int lda,
+                                            double *scratch, int size,
+                                            unsigned long long *incomplete);
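/* Usage sketch for hpl_accel_reform_matrix_CL_to_B (editorial; the wrapper,
 * the allocation, and the spin-wait are illustrative assumptions that follow
 * the restrictions above: n a multiple of 64, lda even and >= roundup(m,64),
 * scratch at least 64*roundup(m,64) elements with about 4*64*m preferred so
 * more SPEs can be deployed). Assumes <stdlib.h> for posix_memalign/free:
 */
static int example_reform_matrix(double *a, int m, int n, int lda)
{
  volatile unsigned long long incomplete = 1;
  int size = 4 * 64 * m;              /* larger scratch => more SPEs used */
  double *scratch;

  if (posix_memalign((void **)&scratch, 128, (size_t)size * sizeof(double)))
    return -1;                        /* allocation failed */

  hpl_accel_reform_matrix_CL_to_B(m, n, a, lda, scratch, size,
                                  (unsigned long long *)&incomplete);
  while (incomplete)
    ;                                 /* wait before freeing the scratch */

  free(scratch);
  return 0;
}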
+
+
+
+/* hpl_accel_reform_panel_B_to_CL
+ * -------------------------------
+ * Copy and reformat the L panel from matrix [a] into the panel buffer pointed to by panel.
+ * The matrix is assumed to be constructed of 64x64 element, row-ordered, big-endian blocks.
+ * The blocks are assumed to be column ordered. The output L panel is assumed to be
+ * column-ordered, little-endian with a leading dimension of ldp.
+ *
+ * m       Number of rows of [a] to copy to panel.
+ * n       Number of columns of [a] to copy to panel.
+ * panel   Pointer to the L panel extracted and reformatted from matrix [a]. The
+ *         [panel] is column-ordered, little-endian.
+ * ldp     Leading dimension of the panel.
+ * a       Block formatted matrix. a points to the start of the panel to be reformatted
+ *         and copied into [panel].
+ * lda     Leading dimension of matrix [a]. This contains the number of doubles to
+ *         advance a from block column n to column n+1.
+ * incomplete Pointer to a system variable that is first initialized to non-zero
+ *         and asynchronously cleared when the requested operation has completed.
+ *         If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ * m       Must be a multiple of 64.
+ * panel   Must be quadword aligned and the buffer may not straddle 4GB boundary (See Note 1).
+ * ldp     Must be even and at least m.
+ * a       Must be quadword aligned and may not straddle a 4GB boundary (See Note 1).
+ * lda     Must be even and at least m*M_SUB.
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ * panel   Must be cacheline aligned.
+ * ldp     Must be a multiple of 16.
+ * a       Must be cacheline aligned.
+ * lda     Must be a multiple of 16.
+ */
+extern void hpl_accel_reform_panel_B_to_CL(int m, int n,
+                                           double *panel, int ldp,
+                                           double *a, int lda,
+                                           unsigned long long *incomplete);
+
+
+
+/* hpl_accel_reform_panel_R_to_B
+ * -------------------------------
+ * Copy and reformat a U panel from a row buffer pointed to by panel into matrix [a].
+ * The input U panel is assumed to be row-ordered, big-endian with a leading dimension of ldp.
+ * The matrix is assumed to be constructed of 64x64 element, row-ordered, big-endian blocks.
+ * The blocks are assumed to be column ordered.
+ *
+ * m       Number of rows of panel to copy to [a].
+ * n       Number of columns of panel to copy to [a].
+ * a       Block formatted matrix. a points to the location within [a] to receive
+ *         the data being copied and reformatted from panel.
+ * lda     Leading dimension of matrix [a]. This contains the number of doubles to
+ *         advance a from block column n to column n+1.
+ * panel   Pointer to the U panel containing the data to be reformatted and copied to
+ *         matrix [a]. The [panel] is row-ordered, big-endian.
+ * ldp     Leading dimension of the panel. This is the number of doubles between
+ *         row n and row n+1.
+ * incomplete Pointer to a system variable that is first initialized to non-zero
+ *         and asynchronously cleared when the requested operation has completed.
+ *         If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ * m       None.
+ * panel   Must be quadword aligned and the buffer may not straddle 4GB boundary (See Note 1).
+ * ldp     Must be even and at least n.
+ * a       Must be quadword aligned and may not straddle a 4GB boundary (See Note 1).
+ * lda     Must be even and at least m*M_SUB.
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ * panel   Must be cacheline aligned.
+ * ldp     Must be a multiple of 16.
+ * a       Must be cacheline aligned.
+ * lda     Must be a multiple of 16.
+ */
+extern void hpl_accel_reform_panel_R_to_B(int m, int n,
+                                          double *a, int lda,
+                                          double *panel, int ldp,
+                                          unsigned long long *incomplete);
+
+
+/* hpl_accel_reform_rows_R_to_B
+ * hpl_accel_reform_rows_B_to_R
+ * ----------------------------
+ * Copy and reformat a set of rows between row ordered and block ordered formats.
+ * hpl_accel_reform_rows_R_to_B reformats rows into blocks and hpl_accel_reform_rows_B_to_R
+ * reformats blocks into rows. These functions are expected to be used to gather/scatter winners
+ * and losers when pivoting so that rows are coalesced into large DMAs for efficient transfer.
+ * No endian swapping is performed on the data. Block data is assumed to be ordered in 64x64,
+ * row ordered elements. The blocks themselves are column ordered.
+ *
+ * m       Number of rows to copy. Specifies the number of entries in the blk_rows array.
+ * n       Number of values (doubles) per row to copy.
+ * rows    Pointer to the data rows to be reformatted and copied to/from matrix [a].
+ * ldr     Leading dimension of the row buffer. This is the number of doubles between
+ *         rows of the [rows] buffer.
+ * a       Block formatted matrix.
+ * lda     Leading dimension of matrix [a]. This contains the number of doubles to
+ *         advance a from block column n to column n+1.
+ * blk_rows Array of row indices. blk_rows specifies the starting [a] block matrix row
+ *         offset for each of the m rows.
+ * blk_col Starting [a] block matrix column offset.
+ * incomplete Pointer to a system variable that is first initialized to non-zero
+ *         and asynchronously cleared when the requested operation has completed.
+ *         If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ * rows    Buffer must not straddle 4GB boundary.
+ * a       Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ * rows    Optimal if cacheline aligned. Accelerated if [rows] is quadword aligned.
+ * ldr     Optimal if a multiple of 16. Accelerated if ldr is even.
+ * a       Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ * lda     Optimal if a multiple of 16. Accelerated if lda is even.
+ * blk_col Optimal if a multiple of 16. Accelerated if blk_col is even.
+ */
+
+extern void hpl_accel_reform_rows_R_to_B(int m, int n,
+                                         double *rows, int ldr,
+                                         double *a, int lda,
+                                         int *blk_rows, int blk_col,
+                                         unsigned long long *incomplete);
+
+extern void hpl_accel_reform_rows_B_to_R(int m, int n,
+                                         double *rows, int ldr,
+                                         double *a, int lda,
+                                         int *blk_rows, int blk_col,
+                                         unsigned long long *incomplete);
+
+/* hpl_accel_swap_rows_B_to_B
+ * ----------------------------
+ * Swap a set of rows in block ordered format.
+ * hpl_accel_swap_rows_B_to_B swaps a set of rows pairwise in a block-formatted matrix.
+ * No endian swapping is performed on the data. Block data is assumed to be ordered in 64x64,
+ * row ordered elements. The blocks themselves are column ordered.
+ *
+ * m       Number of rows to swap. Specifies the number of entries in the blk_rows array.
+ * n       Number of values (doubles) per row to copy.
+ * a       Block formatted matrix.
+ * lda     Leading dimension of matrix [a]. This contains the number of doubles to
+ *         advance a from block column n to column n+1.
+ * blk_rows Array of row indices. blk_rows specifies the starting [a] block matrix row
+ *         offset for each of the m rows.
+ * blk_col Starting [a] block matrix column offset.
+ * incomplete Pointer to a system variable that is first initialized to non-zero
+ *         and asynchronously cleared when the requested operation has completed.
+ *         If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ * a       Buffer may not straddle 4GB boundary (See Note 1).
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ * a       Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ * lda     Optimal if a multiple of 16. Accelerated if lda is even.
+ * blk_col Optimal if a multiple of 16. Accelerated if blk_col is even.
+ */
+
+extern void hpl_accel_swap_rows_B_to_B(int m, int n,
+                                       double *a, int lda,
+                                       int *blk_rows, int blk_col,
+                                       unsigned long long *incomplete);
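/* Pivoting sketch using the helpers above (editorial; the wrapper and its
 * argument names are illustrative assumptions). Gathering the m winner rows
 * listed in blk_rows into one row buffer coalesces many small block accesses
 * into large DMA transfers, which is the intended use during pivoting:
 */
static void example_gather_pivot_rows(double *rows, int ldr,
                                      double *a, int lda,
                                      int *blk_rows, int m, int n, int blk_col)
{
  volatile unsigned long long incomplete = 1;

  hpl_accel_reform_rows_B_to_R(m, n, rows, ldr, a, lda, blk_rows, blk_col,
                               (unsigned long long *)&incomplete);
  while (incomplete)
    ;          /* rows[] now holds the gathered rows, still big-endian */
}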
+
+/* hpl_accel_copy_rows_R_to_R
+ * ----------------------------
+ * Copy a set of rows in row ordered format.
+ * hpl_accel_copy_rows_R_to_R copies a set of rows from row-oriented matrix a to
+ * row-oriented matrix b.
+ * No endian swapping is performed on the data.
+ *
+ * m       Number of rows to copy. Specifies the number of entries in the rows array.
+ * n       Number of values (doubles) per row to copy.
+ * a       Pointer to the source data rows to be copied to row-ordered matrix b.
+ * lda     Leading dimension of the row-ordered source matrix a.
+ * b       Pointer to the row-ordered destination matrix.
+ * ldb     Leading dimension of the row-ordered destination matrix b.
+ * rows    Array of row indices. rows specifies the destination row address in
+ *         row-ordered matrix b to receive the source row from matrix a.
+ * incomplete Pointer to a system variable that is first initialized to non-zero
+ *         and asynchronously cleared when the requested operation has completed.
+ *         If NULL, no completion notification is performed.
+ *
+ * FUNCTIONAL RESTRICTIONS:
+ *
+ * ADDITIONAL PERFORMANCE RESTRICTIONS:
+ * a       Optimal if cacheline aligned. Accelerated if [a] is quadword aligned.
+ * lda     Optimal if a multiple of 16. Accelerated if lda is even.
+ */
+
+extern void hpl_accel_copy_rows_R_to_R(int m, int n,
+                                       double *a, int lda,
+                                       double *b, int ldb,
+                                       int *rows,
+                                       unsigned long long *incomplete);
+
+/* REFERENCE FUNCTIONS.
+ *
+ * These functions are non-accelerated implementations that run on the PPU.
+ *
+ * They may not impose the same functional and performance restrictions as the
+ * SPU-accelerated functions.
+ */
+
+extern int hpl_ref_init();
+
+extern void hpl_ref_dgemm_CL_R_B(int m, int n, int k,
+                                 const double *a, int lda,
+                                 const double *b, int ldb,
+                                 double *c, int ldc,
+                                 unsigned long long *incomplete);
+
+extern void hpl_ref_dgemm_CL_B_B(int m, int n, int k,
+                                 const double *a, int lda,
+                                 const double *b, int ldb,
+                                 double *c, int ldc,
+                                 unsigned long long *incomplete);
+
+extern void hpl_ref_dgemm_CL_R_B_CL(int m, int n, int k,
+                                    const double *a, int lda,
+                                    const double *b, int ldb,
+                                    double *c, int ldc,
+                                    unsigned int blk_row, unsigned int blk_col,
+                                    double *panel, int ldp,
+                                    unsigned long long *incomplete);
+
+extern void hpl_ref_dgemm_CL_B_B_CL(int m, int n, int k,
+                                    const double *a, int lda,
+                                    const double *b, int ldb,
+                                    double *c, int ldc,
+                                    unsigned int blk_row, unsigned int blk_col,
+                                    double *panel, int ldp,
+                                    unsigned long long *incomplete);
+
+extern void hpl_ref_dgemm_C_C_C(int m, int n, int k,
+                                const double *a, int lda,
+                                const double *b, int ldb,
+                                double *c, int ldc,
+                                unsigned long long *incomplete);
+
+extern void hpl_ref_dtrsm_CL_R(int m, int n,
+                               const double *a, int lda,
+                               double *b, int ldb,
+                               unsigned long long *incomplete);
+
+extern void hpl_ref_dtrsm_CL_B(int m, int n,
+                               const double *a, int lda,
+                               double *b, int ldb,
+                               unsigned int blk_row, unsigned int blk_col,
+                               unsigned long long *incomplete);
+
+extern void hpl_ref_dtrsm_CL_R_B(int m, int n,
+                                 const double *a, int lda,
+                                 double *b, int ldb,
+                                 double *c, int ldc,
+                                 unsigned int blk_row, unsigned int blk_col,
+                                 unsigned long long *incomplete);
+
+extern void hpl_ref_reform_matrix_CL_to_B(int m, int n,
+                                          double *a, int lda,
+                                          double *scratch, int size,
+                                          unsigned long long *incomplete);
+
+extern void hpl_ref_reform_panel_B_to_CL(int m, int n,
+                                         double *panel, int ldp,
+                                         double *a, int lda,
+                                         unsigned long long *incomplete);
+
+extern void hpl_ref_reform_panel_R_to_B(int m, int n,
+                                        double *a, int lda,
+                                        double *panel, int ldp,
+                                        unsigned long long *incomplete);
+
+extern void hpl_ref_reform_rows_R_to_B(int m, int n,
+                                       double *rows, int ldr,
+                                       double *a, int lda,
+                                       int *blk_rows, int blk_col,
+                                       unsigned long long *incomplete);
+extern void hpl_ref_reform_rows_B_to_R(int m, int n,
+                                       double *rows, int ldr,
+                                       double *a, int lda,
+                                       int *blk_rows, int blk_col,
+                                       unsigned long long *incomplete);
+
+extern void hpl_ref_swap_rows_B_to_B(int m, int n,
+                                     double *a, int lda,
+                                     int *blk_rows, int blk_col,
+                                     unsigned long long *incomplete);
+
+extern void hpl_ref_copy_rows_R_to_R(int m, int n,
+                                     double *a, int lda,
+                                     double *b, int ldb,
+                                     int *rows,
+                                     unsigned long long *incomplete);
+
+#endif /* __PPU__ */
+
+#endif /* _HPL_ACCEL_H_ */

Index: accel/lib/hpl_accel_copy.c
===================================================================
RCS file: accel/lib/hpl_accel_copy.c
diff -N accel/lib/hpl_accel_copy.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_copy.c	20 Aug 2008 03:57:53 -0000	1.4
@@ -0,0 +1,98 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2008                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include "hpl_accel.h"
+#include "hpl_accel_spu.h"
+#include "ppu_intrinsics.h"
+
+/* General purpose, reference, reformatting facilities.
+ */
+
+void hpl_accel_copy_rows_R_to_R(int m, int n,
+                                double *a, int lda,
+                                double *b, int ldb,
+                                int *rows,
+                                unsigned long long *incomplete)
+{
+  unsigned int non_aligned;
+  int n0 = 0;
+
+  non_aligned = (((unsigned int)(lda | ldb) & 1) |
+                 (((unsigned int)((uintptr_t)a) | (uintptr_t)b) & (16-1)));
+
+  if ((non_aligned == 0) && (n > 1)) {
+    int m_start, m_left, m_per_cmd;
+    unsigned int idx;
+    volatile hpl_accel_copy_rows_parms_t *parms;
+    int i;
+
+    init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+    n0 = n & ~1;
+
+    idx = hpl_accel_cmd_idx;
+
+    m_start = 0;
+    m_left = m;
+
+    /* Generate multiple command requests if the number of rows
+     * is greater than what will fit in a single command request.
+     */
+    m_per_cmd = (int)(sizeof(parms->rows) / sizeof(int));
+
+    while (m_left > 0) {
+
+      parms = (volatile hpl_accel_copy_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+      parms->m = (m_left < m_per_cmd) ? m_left : m_per_cmd;
+      parms->n = n0;
+      parms->lda = lda * sizeof(double);
+      parms->ldb = ldb * sizeof(double);
+
+      parms->a = a + m_start * lda;
+      parms->b = b;
+
+      parms->incomplete = (parms->m < m_left) ? NULL : incomplete;
+
+      for (i=0; i<parms->m; i++) parms->rows[i] = rows[m_start+i];
+
+      /* Perform a sync in order to ensure that the parameters are written
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+
+      /* Send the command to each of the SPEs.
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_COPY_ROWS_R_TO_R, idx, HPL_ACCEL_SPES);
+
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+      m_start += parms->m;
+      m_left -= parms->m;
+    }
+  } else {
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Cleanup portions of the rows not handled by the SPEs above.
+   */
+  if (n0 < n) {
+    unsigned int y1, y2, x;
+    double *src, *dst;
+
+    /* For each of the rows */
+    for (y1=0; y1<(unsigned int)m; y1++) {
+      y2 = rows[y1];    /* New location for row y1 */
+      src = a + (y1 * lda);
+      dst = b + (y2 * ldb);
+      for (x=n0; x<(unsigned int)n; x++) {
+        dst[x] = src[x];
+      }
+    }
+  }
+}
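/* The completion protocol used throughout this library, sketched (editorial):
 * each entry point either dispatches work to the SPEs, in which case
 * *incomplete is set non-zero and cleared asynchronously, or runs
 * synchronously on the PPU and clears it before returning. A caller can
 * therefore overlap independent PPU work before waiting. The helper name
 * and the __lwsync placement are illustrative assumptions:
 */
static inline void example_accel_wait(volatile unsigned long long *incomplete)
{
  while (*incomplete)
    ;          /* flag is cleared by the SPEs' final DMA */
  __lwsync();  /* order subsequent loads after observing completion */
}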
Index: accel/lib/hpl_accel_dgemm.c
===================================================================
RCS file: accel/lib/hpl_accel_dgemm.c
diff -N accel/lib/hpl_accel_dgemm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_dgemm.c	20 Aug 2008 03:57:53 -0000	1.12
@@ -0,0 +1,495 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+#include <stdlib.h>
+#include <stdint.h>
+#include "hpl_accel.h"
+#include "hpl_accel_spu.h"
+#include "hpl_accel_global.h"
+
+#include <ppu_intrinsics.h>
+
+static void _dgemm_CL_R_B_CL(int m, int n, int k,
+                             const double *a, int lda,
+                             const double *b, int ldb,
+                             double *c, int ldc,
+                             unsigned int blk_row, unsigned int blk_col,
+                             double *panel, int ldp)
+{
+  unsigned int i, x, y;
+  unsigned long long *a_ptr;
+  double a_val, b_val, *p;
+
+  if (panel) {
+    /* Write the result into the panel buffer. We first perform the computation,
+     * placing the result into [panel]. Then byte swap panel.
+     */
+    p = panel;
+    for (x=0; x<(unsigned int)n; x++, p += ldp-m) {
+      a_ptr = (unsigned long long *)a;
+      b_val = b[INDEX_ROW(0,x,ldb)];
+
+      for (y=0; y<(unsigned int)m; y++) {
+        a_val = hpl_accel_byte_swap_load(a_ptr++);
+        *p++ = c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] - a_val * b_val;
+      }
+    }
+
+    a += lda;
+    for (i=1; i<(unsigned int)k; i++, a+=lda) {
+      p = panel;
+      for (x=0; x<(unsigned int)n; x++, p += ldp-m) {
+        a_ptr = (unsigned long long *)a;
+        b_val = b[INDEX_ROW(i,x,ldb)];
+
+        for (y=0; y<(unsigned int)m; y++) {
+          a_val = hpl_accel_byte_swap_load(a_ptr++);
+          *p++ -= a_val * b_val;
+        }
+      }
+    }
+#ifdef ACCEL_LITTLE_ENDIAN
+    /* Byte swap panel buffer
+     */
+    unsigned long long *p_ptr = (unsigned long long *)panel;
+    for (x=0; x<(unsigned int)n; x++, p_ptr += ldp-m) {
+      for (y=0; y<(unsigned int)m; y++, p_ptr++) {
+        __stdbrx(p_ptr, *p_ptr);
+      }
+    }
+#endif
+  } else {
+    /* Write the result into the c matrix.
+     */
+    for (i=0; i<(unsigned int)k; i++, a+=lda) {
+      a_ptr = (unsigned long long *)a;
+      for (y=0; y<(unsigned int)m; y++) {
+        a_val = hpl_accel_byte_swap_load(a_ptr++);
+        for (x=0; x<(unsigned int)n; x++) {
+          c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)];
+        }
+      }
+    }
+  }
+}
+
+
+void hpl_accel_dgemm_CL_R_B_CL(int m, int n, int k,
+                               const double *a, int lda,
+                               const double *b, int ldb,
+                               double *c, int ldc,
+                               unsigned int blk_row, unsigned int blk_col,
+                               double *panel, int ldp,
+                               unsigned long long *incomplete)
+{
+  int n0;
+  int m0 = 0;
+  unsigned int cmd;
+  unsigned int idx;
+  unsigned int aligned, bc, br;
+  volatile hpl_accel_dgemm_parms_t *parms;
+
+  /* Do as much of the dgemm as possible using the blocked dgemm SPU specialist.
+   * This specialist assumes:
+   *   m       is at least M_SUB
+   *   n       is at least M_SUB
+   *   k       is equal to M
+   *   a       is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   b       is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   c       is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   panel   is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   lda     is even (qword aligned rows). A multiple of 16 for optimal DMA
+   *           performance
+   *   ldb     is even (qword aligned rows). A multiple of 16 for optimal DMA
+   *           performance
+   *   ldc     is even (qword aligned rows). A multiple of 16 for optimal DMA
+   *           performance
+   *   ldp     is even (qword aligned rows). A multiple of 16 for optimal DMA
+   *           performance
+   *   blk_col is a multiple of M_SUB
+   *   blk_row is a multiple of M_SUB
+   */
+  bc = blk_col/M_SUB;
+  br = blk_row/M_SUB;
+
+  c += (ldc * bc) + br*(M_SUB*M_SUB);
+
+  blk_col %= M_SUB;
+  blk_row %= M_SUB;
+
+  aligned = (blk_row | blk_col |
+             ((unsigned int)(lda | ldb | ldc | ldp) & 1) |
+             ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c | (uintptr_t)panel) & (16-1)));
+
+
+  if ((m >= M_SUB) && (n >= M_SUB) && (k == M) && (aligned == 0)) {
+    /* Either all or a portion of the computation can be done by the SPE accelerators.
+     */
+    m0 = (m/M_SUB) * M_SUB;
+    n0 = (n/M_SUB) * M_SUB;
+
+    /* Verify 4GB boundary expectation.
+     */
+    VALIDATE_PANEL_4GB_CROSSING(a, k, lda);
+    VALIDATE_PANEL_4GB_CROSSING(b, k, ldb);
+    VALIDATE_MATRIX_4GB_CROSSING(c, m0, n0, ldc);
+    VALIDATE_PANEL_4GB_CROSSING(panel, n0, ldp);
+
+    idx = hpl_accel_cmd_idx;
+
+    parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+    /* Place the parameters into a command queue buffer
+     */
+    parms->a = a;
+    parms->b = b;
+    parms->c = c;
+    parms->lda = lda * sizeof(double);
+    parms->ldb = ldb * sizeof(double);
+    parms->ldc = ldc * sizeof(double);
+    parms->n = n0 / M_SUB;
+    parms->m = m0 / M_SUB;
+    parms->b_blk = 0;
+    parms->incomplete = incomplete;
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M);
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->b_count, b, ldb, M);
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n);
+
+    init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+    /* Send the command to each of the SPEs.
+     */
+    hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+    if (panel) {
+      parms->p = panel;
+      parms->ldp = ldp * sizeof(double);
+      cmd = HPL_ACCEL_CMD_DGEMM_PANEL;
+    } else {
+      cmd = HPL_ACCEL_CMD_DGEMM;
+    }
+
+    /* Perform a sync in order to ensure that the parameters are written to
+     * memory before writing to the mailbox command queue.
+     */
+    __sync();
+
+    send_cmd_to_spes(cmd, idx, HPL_ACCEL_SPES);
+
+    /* Complete any remaining portion on the right side. That is when n is not a
+     * multiple of M_SUB.
+     */
+    if (n0 < n) {
+      _dgemm_CL_R_B_CL(m0, n-n0, k, a, lda, b+n0, ldb, c, ldc, blk_row, blk_col+n0, panel + n0*ldp, ldp);
+    }
+  } else {
+    /* This function is completely synchronous, therefore, clear incomplete.
+     */
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Cleanup any remaining portion of the matrix that was not handled above.
+   */
+  if (m0 < m) {
+    _dgemm_CL_R_B_CL(m-m0, n, k, a+m0, lda, b, ldb, c, ldc, blk_row+m0, blk_col, ((panel) ? panel + m0 : panel), ldp);
+  }
+}
+
+
+
+static void _dgemm_CL_B_B_CL(int m, int n, int k,
+                             const double *a, int lda,
+                             const double *b, int ldb,
+                             double *c, int ldc,
+                             unsigned int blk_row, unsigned int blk_col,
+                             double *panel, int ldp)
+{
+  unsigned int i, x, y;
+  unsigned long long *a_ptr;
+  double a_val, b_val, *p;
+
+  if (panel) {
+    /* Write the result into the panel buffer. We first perform the computation,
+     * placing the result into [panel]. Then byte swap panel.
+     */
+    p = panel;
+    for (x=0; x<(unsigned int)n; x++, p += ldp-m) {
+      a_ptr = (unsigned long long *)a;
+      b_val = b[INDEX_BLK(0,x+blk_col,ldb)];
+
+      for (y=0; y<(unsigned int)m; y++) {
+        a_val = hpl_accel_byte_swap_load(a_ptr++);
+        *p++ = c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] - a_val * b_val;
+      }
+    }
+
+    a += lda;
+    for (i=1; i<(unsigned int)k; i++, a+=lda) {
+      p = panel;
+      for (x=0; x<(unsigned int)n; x++, p += ldp-m) {
+        a_ptr = (unsigned long long *)a;
+        b_val = b[INDEX_BLK(i,x+blk_col,ldb)];
+
+        for (y=0; y<(unsigned int)m; y++) {
+          a_val = hpl_accel_byte_swap_load(a_ptr++);
+          *p++ -= a_val * b_val;
+        }
+      }
+    }
+#ifdef ACCEL_LITTLE_ENDIAN
+    /* Byte swap panel buffer
+     */
+    unsigned long long *p_ptr = (unsigned long long *)panel;
+    for (x=0; x<(unsigned int)n; x++, p_ptr += ldp-m) {
+      for (y=0; y<(unsigned int)m; y++, p_ptr++) {
+        __stdbrx(p_ptr, *p_ptr);
+      }
+    }
+#endif
+  } else {
+    /* Write the result into the c matrix.
+     */
+    for (i=0; i<(unsigned int)k; i++, a+=lda) {
+      a_ptr = (unsigned long long *)a;
+      for (y=0; y<(unsigned int)m; y++) {
+        a_val = hpl_accel_byte_swap_load(a_ptr++);
+        for (x=0; x<(unsigned int)n; x++) {
+          c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)];
+        }
+      }
+    }
+  }
+}
+
+
+void hpl_accel_dgemm_CL_B_B_CL(int m, int n, int k,
+                               const double *a, int lda,
+                               const double *b, int ldb,
+                               double *c, int ldc,
+                               unsigned int blk_row, unsigned int blk_col,
+                               double *panel, int ldp,
+                               unsigned long long *incomplete)
+{
+  int n0;
+  int m0 = 0;
+  unsigned int cmd;
+  unsigned int idx;
+  unsigned int aligned, bc, br;
+  volatile hpl_accel_dgemm_parms_t *parms;
+
+  /* Do as much of the dgemm as possible using the blocked dgemm SPU specialist.
+   * This specialist assumes:
+   *   m       is at least M_SUB
+   *   n       is at least M_SUB
+   *   k       is equal to M
+   *   a       is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   b       is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   c       is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   panel   is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   lda     is even (qword aligned rows). A multiple of 16 for optimal DMA
+   *           performance.
+   *   ldb     is even (qword aligned rows). A multiple of 16 for optimal DMA
+   *           performance.
+   *   ldc     is even (qword aligned rows). A multiple of 16 for optimal DMA
+   *           performance.
+   *   ldp     is even (qword aligned rows). A multiple of 16 for optimal DMA
+   *           performance.
+   *   blk_col is a multiple of M_SUB
+   *   blk_row is a multiple of M_SUB
+   */
+
+  bc = blk_col/M_SUB;
+  br = blk_row/M_SUB;
+
+  c += (ldc * bc) + br*(M_SUB*M_SUB);
+  b += (ldb * bc);
+
+  blk_col %= M_SUB;
+  blk_row %= M_SUB;
+
+  aligned = (blk_row | blk_col |
+             ((unsigned int)(lda | ldb | ldc | ldp) & 1) |
+             ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c | (uintptr_t)panel) & (16-1)));
+
+
+  if ((m >= M_SUB) && (n >= M_SUB) && (k == M) && (aligned == 0)) {
+    /* Either all or a portion of the computation can be done by the SPE accelerators.
+     */
+    m0 = (m/M_SUB) * M_SUB;
+    n0 = (n/M_SUB) * M_SUB;
+
+    /* Verify 4GB boundary expectation.
+     */
+    VALIDATE_PANEL_4GB_CROSSING(a, k, lda);
+    VALIDATE_MATRIX_4GB_CROSSING(b, k, n0, ldb);
+    VALIDATE_MATRIX_4GB_CROSSING(c, m0, n0, ldc);
+    VALIDATE_PANEL_4GB_CROSSING(panel, n0, ldp);
+
+    idx = hpl_accel_cmd_idx;
+
+    parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+    /* Place the parameters into a command queue buffer
+     */
+    parms->a = a;
+    parms->b = b;
+    parms->c = c;
+    parms->lda = lda * sizeof(double);
+    parms->ldb = ldb * sizeof(double);
+    parms->ldc = ldc * sizeof(double);
+    parms->n = n0 / M_SUB;
+    parms->m = m0 / M_SUB;
+    parms->b_blk = -1;
+    parms->incomplete = incomplete;
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M);
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n);
+
+    init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+    /* Send the command to each of the SPEs.
+     */
+    hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+    if (panel) {
+      parms->p = panel;
+      parms->ldp = ldp * sizeof(double);
+      cmd = HPL_ACCEL_CMD_DGEMM_PANEL;
+    } else {
+      cmd = HPL_ACCEL_CMD_DGEMM;
+    }
+
+    /* Perform a sync in order to ensure that the parameters are written to
+     * memory before writing to the mailbox command queue.
+     */
+    __sync();
+
+    send_cmd_to_spes(cmd, idx, HPL_ACCEL_SPES);
+
+    /* Complete any remaining portion on the right side. That is when n is not a
+     * multiple of M_SUB.
+     */
+    if (n0 < n) {
+      _dgemm_CL_B_B_CL(m0, n-n0, k, a, lda, b, ldb, c, ldc, blk_row, blk_col+n0, panel + n0*ldp, ldp);
+    }
+  } else {
+    /* This function is completely synchronous, therefore, clear incomplete.
+     */
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Cleanup any remaining portion of the matrix that was not handled above.
+   */
+  if (m0 < m) {
+    _dgemm_CL_B_B_CL(m-m0, n, k, a+m0, lda, b, ldb, c, ldc, blk_row+m0, blk_col, ((panel) ? panel + m0 : panel), ldp);
+  }
+}
+
+
+void _dgemm_C_C_C(int m, int n, int k,
+                  const double *a, int lda,
+                  const double *b, int ldb,
+                  double *c, int ldc)
+{
+  unsigned int i;
+  unsigned int x, y;
+  double a_val;
+
+  for (i=0; i<(unsigned int)k; i++) {
+    for (y=0; y<(unsigned int)m; y++) {
+      a_val = a[INDEX_COL(y,i,lda)];
+      for (x=0; x<(unsigned int)n; x++) {
+        c[INDEX_COL(y,x,ldc)] -= a_val * b[INDEX_COL(i,x,ldb)];
+      }
+    }
+  }
+}
+
+void hpl_accel_dgemm_C_C_C(int m, int n, int k,
+                           const double *a, int lda,
+                           const double *b, int ldb,
+                           double *c, int ldc,
+                           unsigned long long *incomplete)
+{
+  int m0 = 0;
+  int spes;
+  unsigned int cmd, idx, aligned;
+  volatile hpl_accel_dgemm_parms_t *parms;
+
+  /* Do as much of the dgemm as possible using the column-ordered dgemm SPU specialist.
+   * This specialist assumes:
+   *   k   is a multiple of 4 and less than or equal to 64
+   *   m   is a multiple of 8
+   *   n   is a multiple of 4
+   *   a   is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   b   is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   c   is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   lda is even (qword aligned cols). A multiple of 16 for optimal DMA
+   *       performance.
+   *   ldb is even (qword aligned cols). A multiple of 16 for optimal DMA
+   *       performance.
+   *   ldc is even (qword aligned cols). A multiple of 16 for optimal DMA
+   *       performance.
+   */
+
+  aligned = (((unsigned int)(lda | ldb | ldc) & 1) |
+             ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c) & (16-1)));
+
+  if ((m >= 8) && (k <= 64) && (((k & (4-1)) | (n & (4-1))) == 0) && (aligned == 0)) {
+    /* Either all or a portion of the computation can be done by the SPE accelerators.
+     */
+    m0 = (m/8) * 8;
+
+    /* Verify 4GB boundary expectation.
+     */
+    VALIDATE_PANEL_4GB_CROSSING(a, k, lda);
+    VALIDATE_PANEL_4GB_CROSSING(c, n, ldc);
+
+    idx = hpl_accel_cmd_idx;
+    parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+    /* Place the parameters into a command queue buffer
+     */
+    parms->a = a;
+    parms->b = b;
+    parms->c = c;
+    parms->lda = lda * sizeof(double);
+    parms->ldb = ldb * sizeof(double);
+    parms->ldc = ldc * sizeof(double);
+    parms->n = n;
+    parms->m = m0;
+    parms->k = k;
+    parms->incomplete = incomplete;
+
+    /* Compute the number of SPES to deploy. Each SPE will need to compute
+     * at least one M_SUB high block.
+     */
+    spes = (m + (M_SUB-1)) / M_SUB;
+    if (spes > HPL_ACCEL_SPES) spes = HPL_ACCEL_SPES;
+
+    init_incomplete(incomplete, spes);
+
+    /* Send the command to each of the SPEs.
+     */
+    hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+    cmd = HPL_ACCEL_CMD_DGEMM_C_C_C;
+
+    /* Perform a sync in order to ensure that the parameters are written to
+     * memory before writing to the mailbox command queue.
+     */
+    __sync();
+
+    send_cmd_to_spes(cmd, idx, spes);
+
+  } else {
+    /* This function is completely synchronous, therefore, clear incomplete.
+     */
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Cleanup any remaining portion of the matrix that was not handled above.
+   */
+  if (m0 < m) {
+    _dgemm_C_C_C(m-m0, n, k, a+m0, lda, b, ldb, c+m0, ldc);
+  }
+}
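/* Usage sketch for hpl_accel_dgemm_C_C_C (editorial; the wrapper and the
 * overlapped-work placement are illustrative assumptions). Because the call
 * is asynchronous when accelerated, independent PPU work can proceed before
 * the wait:
 */
static void example_dgemm_ccc(const double *a, int lda,
                              const double *b, int ldb,
                              double *c, int ldc,
                              int m, int n, int k)
{
  volatile unsigned long long incomplete = 1;

  /* k: multiple of 4, <= 64; m: multiple of 8; n: multiple of 4 */
  hpl_accel_dgemm_C_C_C(m, n, k, a, lda, b, ldb, c, ldc,
                        (unsigned long long *)&incomplete);

  /* ... independent PPU work can be overlapped here ... */

  while (incomplete)
    ;
}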
Index: accel/lib/hpl_accel_dtrsm.c
===================================================================
RCS file: accel/lib/hpl_accel_dtrsm.c
diff -N accel/lib/hpl_accel_dtrsm.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_dtrsm.c	20 Aug 2008 03:57:53 -0000	1.5
@@ -0,0 +1,250 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+#include <stdlib.h>
+#include <stdint.h>
+#include "hpl_accel.h"
+#include "hpl_accel_spu.h"
+#include "hpl_accel_global.h"
+
+#include <ppu_intrinsics.h>
+
+
+
+void hpl_accel_dtrsm_CL_R_B(int m, int n,
+                            const double *a, int lda,
+                            double *b, int ldb,
+                            double *c, int ldc,
+                            unsigned int blk_row, unsigned int blk_col,
+                            unsigned long long *incomplete)
+{
+  int spes;
+  int spans;
+  int n0 = 0;
+  unsigned int idx;
+  unsigned int aligned;
+  unsigned int cmd;
+  volatile hpl_accel_dtrsm_parms_t *parms;
+
+  /* Do as much of the dtrsm as possible using the dtrsm SPU specialist.
+   * This specialist assumes:
+   *   m   must be 128.
+   *   n   is a multiple of 16.
+   *   a   is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   b   is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   c   is quadword aligned. A multiple of 16 for optimal DMA performance
+   *   lda is even (qword aligned rows). A multiple of 16 for optimal DMA
+   *       performance.
+   *   ldb is even (qword aligned rows). A multiple of 16 for optimal DMA
+   *       performance.
+   *   ldc is even (qword aligned rows). A multiple of 16 for optimal DMA
+   *       performance.
+   */
+  c += (blk_row * M_SUB) + ldc*(blk_col / M_SUB);
+  blk_col %= M_SUB;
+
+  aligned = (((unsigned int)(lda | ldb | ldc) & 1) | (blk_col & 15) |
+             ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c) & (16-1)));
+
+
+  if ((m == M) && (n > 15) && (aligned == 0)) {
+    /* Either all or a portion of the computation can be done by the SPE accelerators.
+     */
+    spans = n/16;
+    n0 = spans * 16;
+
+    /* Verify 4GB boundary expectation.
+     */
+    VALIDATE_PANEL_4GB_CROSSING(a, m, lda);
+    VALIDATE_PANEL_4GB_CROSSING(b, m, ldb);
+    VALIDATE_MATRIX_4GB_CROSSING(c, m, n0, ldc);
+
+    idx = hpl_accel_cmd_idx;
+
+    parms = (volatile hpl_accel_dtrsm_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+    /* Place the parameters into a command queue buffer
+     */
+    parms->a = a;
+    parms->b = b;
+    parms->c = c;
+    parms->lda = lda * sizeof(double);
+    parms->ldb = ldb * sizeof(double);
+    parms->ldc = ldc * sizeof(double);
+    parms->n = n0;
+    parms->m = m / M;
+    parms->blk_col = blk_col / 16;
+    parms->incomplete = incomplete;
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M);
+    COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->b_count, b, ldb, M);
+
+    spes = (spans < HPL_ACCEL_SPES) ? spans : HPL_ACCEL_SPES;
+
+    init_incomplete(incomplete, spes);
+
+    /* Perform a sync in order to ensure that the parameters are written to
+     * memory before writing to the mailbox command queue.
+     */
+    __sync();
+
+    /* Send the command to each of the SPEs.
+     */
+    hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+    cmd = (c == NULL) ? HPL_ACCEL_CMD_DTRSM : HPL_ACCEL_CMD_DTRSM_PANEL;
+
+    send_cmd_to_spes(cmd, idx, spes);
+  } else {
+    /* This function is completely synchronous, therefore, clear incomplete.
+     */
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Cleanup any remaining portion of the matrix that was not handled above.
+   */
+  if (n0 < n) {
+    unsigned int i, x, y;
+    unsigned long long *a_ptr;
+    double a_val;
+    double *b_next;
+
+    a_ptr = (unsigned long long *)a;
+    if (c) {
+      /* Perform DTRSM cleanup into a block format matrix row.
+       */
+      for (x=n0; x<(unsigned int)n; x++) {
+        c[INDEX_BLK(0, x+blk_col, ldc)] = b[INDEX_ROW(0, x, ldb)];
+      }
+      /* y == 1 */
+      a_ptr++;
+      for (i=1; i<(unsigned int)m; i++) {
+        a_val = hpl_accel_byte_swap_load(a_ptr++);
+        for (x=n0; x<(unsigned int)n; x++) {
+          c[INDEX_BLK(i, x+blk_col, ldc)] = b[INDEX_ROW(i, x, ldb)] - b[INDEX_ROW(0, x, ldb)] * a_val;
+        }
+      }
+      a_ptr += (lda - m);
+
+      /* y > 1
+       */
+      for (y=2; y<(unsigned int)m; y++) {
+        a_ptr += y;
+        for (i=y; i<(unsigned int)m; i++) {
+          a_val = hpl_accel_byte_swap_load(a_ptr++);
+          for (x=n0; x<(unsigned int)n; x++) {
+            c[INDEX_BLK(i, x+blk_col, ldc)] -= c[INDEX_BLK(y-1, x+blk_col, ldc)] * a_val;
+          }
+        }
+        a_ptr += (lda - m);
+      }
+    } else {
+      /* Perform DTRSM cleanup into [b]
+       */
+      for (y=1; y<(unsigned int)m; y++, b+=ldb) {
+        a_ptr += y;
+        b_next = b+ldb;
+        for (i=y; i<(unsigned int)m; i++) {
+          a_val = hpl_accel_byte_swap_load(a_ptr++);
+          for (x=n0; x<(unsigned int)n; x++) {
+            b_next[x] -= b[x] * a_val;
+          }
+          b_next += ldb;
+        }
+        a_ptr += (lda - m);
+      }
+    }
+  }
+}
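/* What the VALIDATE_*_4GB_CROSSING calls above guard, sketched (editorial;
 * the actual macros are defined elsewhere in this patch set and are not
 * shown in this section). Note 1 in hpl_accel.h requires that buffers either
 * not straddle a 4GB address boundary, or cross at most one boundary and
 * only on a row/column or block boundary. A direct straddle test:
 */
#include <stdint.h>
static inline int example_straddles_4gb(const void *p, unsigned long bytes)
{
  uint64_t ea = (uint64_t)(uintptr_t)p;
  return (ea >> 32) != ((ea + bytes - 1) >> 32);  /* 1 if the span crosses */
}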
+ */ + VALIDATE_PANEL_4GB_CROSSING(a, m, lda); + VALIDATE_PANEL_4GB_CROSSING(b, m, ldb); + VALIDATE_MATRIX_4GB_CROSSING(c, m, n0, ldc); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_dtrsm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->b = b; + parms->c = c; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->ldc = ldc * sizeof(double); + parms->n = n0; + parms->m = m / M; + parms->blk_col = blk_col / 16; + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M); + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->b_count, b, ldb, M); + + spes = (spans < HPL_ACCEL_SPES) ? spans : HPL_ACCEL_SPES; + + init_incomplete(incomplete, spes); + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + cmd = (c == NULL) ? HPL_ACCEL_CMD_DTRSM : HPL_ACCEL_CMD_DTRSM_PANEL; + + send_cmd_to_spes(cmd, idx, spes); + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above. + */ + if (n0 < n) { + unsigned int i, x, y; + unsigned long long *a_ptr; + double a_val; + double *b_next; + + a_ptr = (unsigned long long *)a; + if (c) { + /* Perform DTRSM cleanup into a block format matrix row. + */ + for (x=n0; x<(unsigned int)n; x++) { + c[INDEX_BLK(0, x+blk_col, ldc)] = b[INDEX_ROW(0, x, ldb)]; + } + /* y == 1 */ + a_ptr++; + for (i=1; i<(unsigned int)m; i++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=n0; x<(unsigned int)n; x++) { + c[INDEX_BLK(i, x+blk_col, ldc)] = b[INDEX_ROW(i, x, ldb)] - b[INDEX_ROW(0, x, ldb)] * a_val; + } + } + a_ptr += (lda - m); + + /* y > 1 + */ + for (y=2; y<(unsigned int)m; y++) { + a_ptr += y; + for (i=y; i<(unsigned int)m; i++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=n0; x<(unsigned int)n; x++) { + c[INDEX_BLK(i, x+blk_col, ldc)] -= c[INDEX_BLK(y-1, x+blk_col, ldc)] * a_val; + } + } + a_ptr += (lda - m); + } + } else { + /* Perform DTRSM cleanup into [b] + */ + for (y=1; y<(unsigned int)m; y++, b+=ldb) { + a_ptr += y; + b_next = b+ldb; + for (i=y; i<(unsigned int)m; i++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=n0; x<(unsigned int)n; x++) { + b_next[x] -= b[x] * a_val; + } + b_next += ldb; + } + a_ptr += (lda - m); + } + } + } +} + + + +void hpl_accel_dtrsm_CL_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete) +{ + int spes; + int spans; + int n0 = 0; + unsigned int i, x, y; + unsigned int idx; + unsigned int aligned; + volatile hpl_accel_dtrsm_parms_t *parms; + + /* Do as much of the dtrsm as possible using the dtrsm SPU specialist. + * This specialist assumes: + * m is at 128. + * n is a multiple of 16. + * a is quadword aligned. A multiple of 16 for optimal DMA performance + * b is quadword aligned. A multiple of 16 for optimal DMA performance + * lda is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldb is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. 
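+   *
+   * The 'aligned' expression below folds all of these checks into a
+   * single test: it ORs the low bit of each leading dimension with
+   * the low four bits of blk_col and of each pointer, so any
+   * violation produces a nonzero value and the function falls back
+   * to the scalar cleanup path.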
+ */ + b += (blk_row * M_SUB) + ldb*(blk_col / M_SUB); + blk_col %= M_SUB; + + aligned = (((unsigned int)(lda | ldb) & 1) | (blk_col & 15) | + ((unsigned int)((uintptr_t)a | (uintptr_t)b) & (16-1))); + + + if ((m == M) && (n > 15) && (aligned == 0)) { + /* Either all or a portion of the computation can be done by the SPE accelerators. + */ + spans = n/16; + n0 = spans * 16; + + /* Verify 4GB boundary expectation. + */ + VALIDATE_PANEL_4GB_CROSSING(a, m, lda); + VALIDATE_MATRIX_4GB_CROSSING(b, m, n0, ldb); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_dtrsm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->b = b; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->n = n0; + parms->m = m / M; + parms->blk_col = blk_col / 16; + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M); + + spes = (spans < HPL_ACCEL_SPES) ? spans : HPL_ACCEL_SPES; + + init_incomplete(incomplete, spes); + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + send_cmd_to_spes(HPL_ACCEL_CMD_DTRSM_CL_B, idx, spes); + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above. + */ + for (x=n0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + b[INDEX_BLK(i, x+blk_col, ldb)] -= b[INDEX_BLK(y-1, x+blk_col, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } +} Index: accel/lib/hpl_accel_global.c =================================================================== RCS file: accel/lib/hpl_accel_global.c diff -N accel/lib/hpl_accel_global.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_global.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,19 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ +#include "hpl_accel_spu.h" +#include "hpl_accel_global.h" + + +/* SPE Thread Info + */ +int hpl_accel_initialized = 0; +hpl_accel_thread_info_t hpl_accel_threads[HPL_ACCEL_SPES]; + + +/* SPE Command Queue + */ +unsigned int hpl_accel_cmd_idx = 0; +hpl_accel_cmd_entry_t hpl_accel_cmd_queue[HPL_ACCEL_CMD_ENTRIES]; + Index: accel/lib/hpl_accel_global.h =================================================================== RCS file: accel/lib/hpl_accel_global.h diff -N accel/lib/hpl_accel_global.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_global.h 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,34 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ +#include +#include +#include "hpl_accel_spu.h" + +#ifndef _HPL_ACCEL_GLOBAL_H_ +#define _HPL_ACCEL_GLOBAL_H_ + +#define HPL_ACCEL_CMD_ENTRIES 8 /* number of command queue entries */ + + +typedef struct hpl_accel_thread_info { + spe_context_ptr_t id; + pthread_t pthread; + spe_spu_control_area_t *ctl_area; // pointer to control ps area + int in_cnt; // inbound mailbox available element count + struct 
hpl_accel_init_parms *init_parms; +} hpl_accel_thread_info_t; + + +typedef struct hpl_accel_cmd_entry { + unsigned char parms[128] __attribute__ ((aligned (128))); +} hpl_accel_cmd_entry_t; + + +extern int hpl_accel_initialized; +extern hpl_accel_thread_info_t hpl_accel_threads[HPL_ACCEL_SPES]; +extern unsigned int hpl_accel_cmd_idx; +extern hpl_accel_cmd_entry_t hpl_accel_cmd_queue[HPL_ACCEL_CMD_ENTRIES]; + +#endif /* _HPL_ACCEL_GLOBAL_H_ */ Index: accel/lib/hpl_accel_init.c =================================================================== RCS file: accel/lib/hpl_accel_init.c diff -N accel/lib/hpl_accel_init.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_init.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,112 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "hpl_accel_global.h" +#include "hpl_accel_spu.h" + +static hpl_accel_init_parms_t init_parms[HPL_ACCEL_SPES]; + +static void *ppu_pthread_function(void *arg) { + hpl_accel_thread_info_t *info; + unsigned int entry = SPE_DEFAULT_ENTRY; + + info = (hpl_accel_thread_info_t *)arg; + + if (spe_context_run(info->id, &entry, 0, (void *)(info->init_parms), NULL, NULL) < 0) { + perror("Failed running context"); + exit (1); + } + pthread_exit(NULL); +} + +extern spe_program_handle_t hpl_accel_spu; + + +int hpl_accel_init() +{ + int i; + + if (!hpl_accel_initialized) { + + /* Create each of the SPU threads + */ + for (i=0; i +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" +#include "ppu_intrinsics.h" + +/* General purpose, reference, reformating facilities. + */ +void hpl_accel_reform_panel_CL_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete) +{ + hpl_ref_reform_panel_CL_to_B(m, n, a, lda, panel, ldp, incomplete); +} + + +void hpl_accel_reform_matrix_CL_to_B(int m, int n, + double *a, int lda, + double *scratch, int size, + unsigned long long *incomplete) + +{ + unsigned int idx; + int spes; + int m_padded; + volatile hpl_accel_reform_matrix_CL_to_B_parms_t *parms; + + m_padded = ((m + M_SUB-1)/M_SUB)*M_SUB; + + /* Assert that the parameter restrictions are not violated. + * n Must be an intregral multiple of 64. + * a Must be quadword aligned. + * lda Must be even and at least roundup(m,64). + * scratch Must be quadword aligned and must not straddle 4GB boundary. + * size Must be at least 64*roundup(m,64). + */ + assert((n % M_SUB) == 0); + assert(lda >= m_padded); + assert(size >= (m_padded-4)*M_SUB); + + /* Assert that the parameters conform also to the desired performance restrictions: + * a Must be cacheline aligned. + * lda Must be a mulitple of 16. + * scratch Must be cacheline aligned. + * size Must be at least 4*64*m for optimal performance. + */ + assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0); + assert(((uintptr_t)scratch & (uintptr_t)127) == (uintptr_t)0); + assert((lda & 15) == 0); + + + /* Verify 4GB boundary expectation. 
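+   * (The scratch buffer is validated as one column of 'size' doubles,
+   * i.e., it must not itself straddle a 4GB boundary.)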
+   */
+  VALIDATE_PANEL_4GB_CROSSING(scratch, 1, size);
+  VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda*M_SUB);
+
+  idx = hpl_accel_cmd_idx;
+
+  parms = (volatile hpl_accel_reform_matrix_CL_to_B_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+  /* Compute the number of SPEs to deploy
+   */
+  spes = size / ((m_padded-4) * M_SUB);
+  if (spes > HPL_ACCEL_SPES) spes = HPL_ACCEL_SPES;
+
+  /* Place the parameters into a command queue buffer
+   */
+  parms->a = a;
+  parms->scratch = scratch;
+  parms->lda = lda * sizeof(double);
+  parms->n = n;
+  parms->m = m;
+  parms->spes = spes;
+  parms->incomplete = incomplete;
+
+  init_incomplete(incomplete, spes);
+
+  /* Perform a sync in order to ensure that the parameters are written
+   * to memory before writing to the mailbox command queue.
+   */
+  __sync();
+
+  /* Send the command to each of the SPEs.
+   */
+  hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+  send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_MATRIX_CL_TO_B, idx, spes);
+}
+
+
+
+
+
+void hpl_accel_reform_panel_B_to_CL(int m, int n,
+                                    double *panel, int ldp,
+                                    double *a, int lda,
+                                    unsigned long long *incomplete)
+{
+  unsigned int idx;
+  volatile hpl_accel_reform_panel_parms_t *parms;
+
+  /* Assert that the parameter restrictions are not violated.
+   *  m      Must be an integral multiple of 64.
+   *  n      Must be at least 1.
+   *  panel  Must be quadword aligned and buffer may not straddle 4GB boundary.
+   *  ldp    Must be at least m.
+   *  a      Must be quadword aligned and buffer may not straddle 4GB boundary.
+   *  lda    Must be at least m*M_SUB.
+   */
+  assert((m % M_SUB) == 0);
+  assert(n > 0);
+  assert(ldp >= m);
+  assert(lda >= m*M_SUB);
+
+  VALIDATE_PANEL_4GB_CROSSING(panel, n, ldp);
+  VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda);
+
+  /* Assert that the parameters conform also to the desired performance restrictions:
+   *  a      Must be cacheline aligned.
+   *  lda    Must be a multiple of 16.
+   *  panel  Must be cacheline aligned.
+   *  ldp    Must be a multiple of 16.
+   */
+  assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0);
+  assert(((uintptr_t)panel & (uintptr_t)127) == (uintptr_t)0);
+  assert((lda & 15) == 0);
+  assert((ldp & 15) == 0);
+
+  idx = hpl_accel_cmd_idx;
+
+  parms = (volatile hpl_accel_reform_panel_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+  /* Place the parameters into a command queue buffer
+   */
+  parms->n = n;
+  parms->m = m;
+  parms->a = a;
+  parms->lda = lda * sizeof(double);
+  parms->panel = panel;
+  parms->ldp = ldp * sizeof(double);
+  parms->incomplete = incomplete;
+  COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n);
+
+  init_incomplete(incomplete, HPL_ACCEL_REFORM_SPES);
+
+  /* Perform a sync in order to ensure that the parameters are written
+   * to memory before writing to the mailbox command queue.
+   */
+  __sync();
+
+  /* Send the command to each of the SPEs.
+   */
+  hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+  send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_PANEL_B_TO_CL, idx, HPL_ACCEL_REFORM_SPES);
+}
+
+
+void hpl_accel_reform_panel_R_to_B(int m, int n,
+                                   double *a, int lda,
+                                   double *panel, int ldp,
+                                   unsigned long long *incomplete)
+{
+  unsigned int idx;
+  volatile hpl_accel_reform_panel_parms_t *parms;
+
+  /* Assert that the parameter restrictions are not violated.
+   *  panel  Must be quadword aligned and buffer may not straddle 4GB boundary.
+   *  ldp    Must be even and at least n.
+   *  a      Must be quadword aligned and buffer may not straddle 4GB boundary.
+   *  lda    Must be at least m*M_SUB.
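+   *
+   * (ldp must be even so that, with 8-byte doubles and a quadword
+   * aligned base, every row of the panel starts on a 16-byte
+   * boundary.)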
+   */
+  assert((ldp & 1) == 0);
+  assert(ldp >= n);
+  assert(lda >= m*M_SUB);
+
+  VALIDATE_PANEL_4GB_CROSSING(panel, m, ldp);
+  VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda);
+
+  /* Assert that the parameters conform also to the desired performance restrictions:
+   *  a      Must be cacheline aligned.
+   *  lda    Must be a multiple of 16.
+   *  panel  Must be cacheline aligned.
+   *  ldp    Must be a multiple of 16.
+   */
+  assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0);
+  assert(((uintptr_t)panel & (uintptr_t)127) == (uintptr_t)0);
+  assert((lda & 15) == 0);
+  assert((ldp & 15) == 0);
+
+  idx = hpl_accel_cmd_idx;
+
+  parms = (volatile hpl_accel_reform_panel_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+  /* Place the parameters into a command queue buffer
+   */
+  parms->n = n;
+  parms->m = m;
+  parms->a = a;
+  parms->lda = lda * sizeof(double);
+  parms->panel = panel;
+  parms->ldp = ldp * sizeof(double);
+  parms->incomplete = incomplete;
+  COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n);
+
+  init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+  /* Perform a sync in order to ensure that the parameters are written
+   * to memory before writing to the mailbox command queue.
+   */
+  __sync();
+
+  /* Send the command to each of the SPEs.
+   */
+  hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+  send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_PANEL_R_TO_B, idx, HPL_ACCEL_SPES);
+}
+
+
+void hpl_accel_reform_rows_R_to_B(int m, int n,
+                                  double *rows, int ldr,
+                                  double *a, int lda,
+                                  int *blk_rows, int blk_col,
+                                  unsigned long long *incomplete)
+{
+  int i;
+  unsigned int non_aligned;
+  int n0 = 0;
+
+
+  a += (blk_col/M_SUB) * lda;
+  blk_col %= M_SUB;
+
+  non_aligned = (((unsigned int)(blk_col | lda | ldr) & 1) |
+                 (((unsigned int)(uintptr_t)a | (uintptr_t)rows) & (16-1)));
+
+  if ((non_aligned == 0) && (n > 1)) {
+    int m_left;
+    int rows_per_block;
+    int *blk_row_ptr;
+    double *rows_ptr;
+    unsigned int idx;
+    volatile hpl_accel_reform_rows_parms_t *parms;
+
+    /* Assert that we won't span a 4G boundary crossing
+     */
+    assert((((uintptr_t)rows) >> 32) == ((uintptr_t)(rows + m*ldr - 1) >> 32));
+
+    VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda);
+
+    n0 = n & ~1;
+
+    idx = hpl_accel_cmd_idx;
+
+    m_left = m;
+    blk_row_ptr = blk_rows;
+    rows_ptr = rows;
+
+    /* Generate multiple command requests if the number of rows
+     * is greater than what will fit in a single command request.
+     */
+    rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int));
+
+    while (m_left > rows_per_block) {
+      parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+      parms->m = rows_per_block;
+      parms->n = n0;
+      parms->rows = rows_ptr;
+      parms->ldr = ldr * sizeof(double);
+      parms->a = a;
+      parms->lda = lda * sizeof(double);
+      parms->blk_col = blk_col;
+
+      parms->incomplete = NULL;
+      for (i=0; i<rows_per_block; i++) parms->blk_rows[i] = blk_row_ptr[i];
+
+      /* Perform a sync in order to ensure that the parameters are written
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+
+      /* Send the command to each of the SPEs.
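+       *
+       * Every command queued in this loop passes incomplete = NULL;
+       * only the final command below carries the caller's completion
+       * variable.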
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B, idx, HPL_ACCEL_SPES);
+
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+      m_left -= rows_per_block;
+      rows_ptr += rows_per_block * ldr;
+      blk_row_ptr += rows_per_block;
+    }
+
+    if (m_left > 0) {
+      parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+      parms->m = m_left;
+      parms->n = n0;
+      parms->rows = rows_ptr;
+      parms->ldr = ldr * sizeof(double);
+      parms->a = a;
+      parms->lda = lda * sizeof(double);
+      parms->blk_col = blk_col;
+
+      parms->incomplete = incomplete;
+      for (i=0; i<m_left; i++) parms->blk_rows[i] = blk_row_ptr[i];
+
+      init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+      /* Perform a sync in order to ensure that the parameters are written
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+
+      /* Send the command to each of the SPEs.
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B, idx, HPL_ACCEL_SPES);
+
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+    } else {
+      if (incomplete) *incomplete = 0;
+    }
+    hpl_accel_cmd_idx = idx;
+  } else {
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Cleanup portions of the rows not handled by the SPEs above.
+   */
+  if (n0 < n) {
+    unsigned int x, y, row;
+    int first_span, span, left;
+    double *src, *dst;
+
+    blk_col += n0;
+    rows += n0;
+    n -= n0;
+
+    a += (blk_col/M_SUB) * lda;
+    blk_col %= M_SUB;
+
+    first_span = M_SUB - blk_col;
+    if (first_span > n) first_span = n;
+
+    /* For each of the rows */
+    for (y=0; y<(unsigned int)m; y++) {
+      row = blk_rows[y];
+      dst = a + (row * M_SUB);
+      span = first_span;
+      left = n - first_span;
+
+      /* For each of the destination buffer block spans
+       */
+      src = rows;
+
+      for (x=0; x<(unsigned int)span; x++) dst[x+blk_col] = src[x];
+      while (left) {
+        dst += lda;
+        src += span;
+        span = (left > M_SUB) ? M_SUB : left;
+        for (x=0; x<(unsigned int)span; x++) dst[x] = src[x];
+        left -= span;
+      }
+      rows += ldr;
+    }
+  }
+}
+
+
+void hpl_accel_reform_rows_B_to_R(int m, int n,
+                                  double *rows, int ldr,
+                                  double *a, int lda,
+                                  int *blk_rows, int blk_col,
+                                  unsigned long long *incomplete)
+{
+  int i;
+  unsigned int non_aligned;
+  int n0 = 0;
+
+  a += (blk_col/M_SUB) * lda;
+  blk_col %= M_SUB;
+
+  non_aligned = (((unsigned int)(blk_col | lda | ldr) & 1) |
+                 (((unsigned int)(uintptr_t)a | (uintptr_t)rows) & (16-1)));
+
+  if ((non_aligned == 0) && (n > 1)) {
+    int m_left;
+    int rows_per_block;
+    int *blk_row_ptr;
+    double *rows_ptr;
+    unsigned int idx;
+    volatile hpl_accel_reform_rows_parms_t *parms;
+
+    /* Assert that we won't span a 4G boundary crossing
+     */
+    assert((((uintptr_t)rows) >> 32) == ((uintptr_t)(rows + m*ldr - 1) >> 32));
+
+    VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda);
+
+    n0 = n & ~1;
+
+    idx = hpl_accel_cmd_idx;
+
+    m_left = m;
+    blk_row_ptr = blk_rows;
+    rows_ptr = rows;
+
+    /* Generate multiple command requests if the number of rows
+     * is greater than what will fit in a single command request.
+     */
+    rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int));
+
+    while (m_left > rows_per_block) {
+      parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+      parms->m = rows_per_block;
+      parms->n = n0;
+      parms->rows = rows_ptr;
+      parms->ldr = ldr * sizeof(double);
+      parms->a = a;
+      parms->lda = lda * sizeof(double);
+      parms->blk_col = blk_col;
+
+      parms->incomplete = NULL;
+      for (i=0; i<rows_per_block; i++) parms->blk_rows[i] = blk_row_ptr[i];
+
+      /* Perform a sync in order to ensure that the parameters are written
+       * to memory before writing to the mailbox command queue.
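+       * (__sync() from ppu_intrinsics.h issues the PowerPC sync
+       * instruction, ordering the parameter stores ahead of the MMIO
+       * mailbox write that follows.)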
+ */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + m_left -= rows_per_block; + rows_ptr += rows_per_block * ldr; + blk_row_ptr += rows_per_block; + } + + if (m_left > 0) { + parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = m_left; + parms->n = n0; + parms->rows = rows_ptr; + parms->ldr = ldr * sizeof(double); + parms->a = a; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->incomplete = incomplete; + for (i=0; iblk_rows[i] = blk_row_ptr[i]; + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + } else { + if (incomplete) *incomplete = 0; + } + hpl_accel_cmd_idx = idx; + } else { + if (incomplete) *incomplete = 0; + } + + /* Cleanup portions of the rows not implemented by the SPEs above. + */ + if (n0 < n) { + unsigned int x, y, row; + int first_span, span, left; + double *src, *dst; + + blk_col += n0; + rows += n0; + n -= n0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + first_span = M_SUB - blk_col; + if (first_span > n) first_span = n; + + /* For each of the rows */ + for (y=0; y<(unsigned int)m; y++) { + row = (unsigned int)blk_rows[y]; + left = n; + src = a + (row * M_SUB); + span = first_span; + left = n - first_span; + + /* For each of the destination buffer block spans + */ + dst = rows; + + for (x=0; x<(unsigned int)span; x++) dst[x] = src[x+(unsigned int)blk_col]; + while (left) { + src += lda; + dst += span; + span = (left > M_SUB) ? 
M_SUB : left; + for (x=0; x<(unsigned int)span; x++) dst[x] = src[x]; + left -= span; + } + rows += ldr; + } + } +} + Index: accel/lib/hpl_accel_spu.h =================================================================== RCS file: accel/lib/hpl_accel_spu.h diff -N accel/lib/hpl_accel_spu.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_spu.h 23 Oct 2008 21:20:24 -0000 1.12 @@ -0,0 +1,417 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +/* This file contains definitions shared between the PPE and SPE + */ + +#ifndef _HPL_ACCEL_SPU_H_ +#define _HPL_ACCEL_SPU_H_ + +#include "hpl_accel.h" +#include + +#define SUB (2) /* Number of sub-blocks per block (1 dim)*/ +#define M (SUB*M_SUB) /* Size of the matrix block - M x M */ +#define SUB_SUB (SUB*SUB) /* The number of sub-blocks per block */ + + +/* SPE Commands + */ +#define HPL_ACCEL_CMD_DGEMM 0 +#define HPL_ACCEL_CMD_DTRSM 1 +#define HPL_ACCEL_CMD_REFORM_MATRIX_CL_TO_B 2 +#define HPL_ACCEL_CMD_REFORM_PANEL_B_TO_CL 3 +#define HPL_ACCEL_CMD_REFORM_PANEL_R_TO_B 4 +#define HPL_ACCEL_CMD_DGEMM_PANEL 5 +#define HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B 6 +#define HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R 7 +#define HPL_ACCEL_CMD_FINI 8 +#define HPL_ACCEL_CMD_DTRSM_CL_B 9 +#define HPL_ACCEL_CMD_DTRSM_PANEL 10 +#define HPL_ACCEL_CMD_DGEMM_C_C_C 11 +#define HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B 12 +#define HPL_ACCEL_CMD_COPY_ROWS_R_TO_R 13 + + +#define HPL_ACCEL_CMD_MASK 0x7F + +#define HPL_ACCEL_SPES 8 /* # of SPEs to use per accelerator */ +#define HPL_ACCEL_REFORM_SPES 4 /* # of SPEs to use during some reformat */ +#define HPL_ACCEL_PARM_TAG 31 + +/* Function parameters */ + +#ifdef __SPU__ +#include + +typedef struct hpl_accel_init_parms { + unsigned int id __attribute__ ((aligned (16))); + unsigned long long cmd_base __attribute__ ((aligned (16)));; +} hpl_accel_init_parms_t; + +typedef struct hpl_accel_dgemm_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long b __attribute__ ((aligned (16))); + unsigned long long c __attribute__ ((aligned (16))); + unsigned long long p __attribute__ ((aligned (16))); + vec_uint4 ld; /* lda, ldb, ldc, ldp */ + vec_uint4 dim; /* n, m, k, pad */ + vec_uint4 flags; /* b_blk, a_count, b_count, p_count */ + unsigned long long incomplete __attribute__ ((aligned (16))); +} hpl_accel_dgemm_parms_t; + +typedef struct hpl_accel_dtrsm_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long b __attribute__ ((aligned (16))); + unsigned long long c __attribute__ ((aligned (16))); + vec_uint4 ld; /* lda, ldb, ldc, pad */ + vec_uint4 dim; /* n, m, a_count, b_count */ + vec_uint4 blk_col; + unsigned long long incomplete __attribute__ ((aligned (16))); +} hpl_accel_dtrsm_parms_t; + +typedef struct hpl_accel_reform_matrix_CL_to_B_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long scratch __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16))); + int spes __attribute__ ((aligned (16))); + unsigned long long incomplete __attribute__ ((aligned (16))); +} hpl_accel_reform_matrix_CL_to_B_parms_t; + +typedef struct hpl_accel_reform_panel_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long panel __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldp 
__attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16))); + int p_count __attribute__ ((aligned (16))); + unsigned long long incomplete __attribute__ ((aligned (16))); +} hpl_accel_reform_panel_parms_t; + +typedef struct hpl_accel_reform_rows_parms { + vector signed int m_n_ldr_lda; + vector unsigned long long rows_a; + vector unsigned long long incomplete_blk_col; + int blk_rows[5*4]; +} hpl_accel_reform_rows_parms_t; + +typedef struct hpl_accel_swap_rows_parms { + vector signed int m_n_lda_blk_col __attribute__ ((aligned (16))); + vector unsigned long long a_incomplete __attribute__ ((aligned (16))); + int blk_rows[6*4]; +} hpl_accel_swap_rows_parms_t; + +typedef struct hpl_accel_copy_rows_parms { + vector signed int m_n_lda_ldb __attribute__ ((aligned (16))); + vector unsigned long long a_b __attribute__ ((aligned (16))); + vector unsigned long long incomplete_pad __attribute__ ((aligned (16))); + int rows[4*4]; +} hpl_accel_copy_rows_parms_t; + +#else + +typedef struct hpl_accel_init_parms { + unsigned int id __attribute__ ((aligned (16))); + void *cmd_base __attribute__ ((aligned (16))); + void *signotify1[HPL_ACCEL_SPES] __attribute__ ((aligned (16))); +} hpl_accel_init_parms_t; + +typedef struct hpl_accel_dgemm_parms { + const double *a __attribute__ ((aligned (16))); + const double *b __attribute__ ((aligned (16))); + double *c __attribute__ ((aligned (16))); + double *p __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldb; + int ldc; + int ldp; + int n __attribute__ ((aligned (16))); + int m; + int k; + int b_blk __attribute__ ((aligned (16))); + int a_count; + int b_count; + int p_count; + unsigned long long *incomplete __attribute__ ((aligned (16))); +} hpl_accel_dgemm_parms_t; + + +typedef struct hpl_accel_dtrsm_parms { + const double *a __attribute__ ((aligned (16))); + double *b __attribute__ ((aligned (16))); + double *c __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldb; + int ldc; + int n __attribute__ ((aligned (16))); + int m; + int a_count; + int b_count; + unsigned int blk_col __attribute__ ((aligned (16))); + unsigned long long *incomplete __attribute__ ((aligned (16))); +} hpl_accel_dtrsm_parms_t; + +typedef struct hpl_accel_reform_matrix_CL_to_B_parms { + double *a __attribute__ ((aligned (16))); + double *scratch __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16)));; + int spes __attribute__ ((aligned (16))); + unsigned long long *incomplete __attribute__ ((aligned (16))); +} hpl_accel_reform_matrix_CL_to_B_parms_t; + +typedef struct hpl_accel_reform_panel_parms { + double *a __attribute__ ((aligned (16))); + double *panel __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldp __attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16))); + int p_count __attribute__ ((aligned (16))); + unsigned long long *incomplete __attribute__ ((aligned (16))); +} hpl_accel_reform_panel_parms_t; + +typedef struct hpl_accel_reform_rows_parms { + int m, n, ldr, lda; + double *rows, *a; + unsigned long long *incomplete; + int blk_col, pad; + int blk_rows[5*4]; +} hpl_accel_reform_rows_parms_t; + +typedef struct hpl_accel_swap_rows_parms { + int m, n, lda, blk_col; + double *a; + unsigned long long *incomplete; + int blk_rows[6*4]; +} hpl_accel_swap_rows_parms_t; + +typedef struct 
hpl_accel_copy_rows_parms {
+  int m, n, lda, ldb;
+  double *a;
+  double *b;
+  unsigned long long *incomplete;
+  unsigned long long pad;
+  int rows[4*4];
+} hpl_accel_copy_rows_parms_t;
+
+#endif
+
+
+/* Inline support functions.
+ */
+#ifdef __PPU__
+
+#include <libspe2.h>		/* spe_spu_control_area_t */
+#include "hpl_accel_global.h"
+
+
+/* init_incomplete
+ * ---------------
+ * Initialize the asynchronous completion notification variable according
+ * to the specified number of participants. The number of participants can
+ * be between 1 and 8, where each byte in the unsigned long long variable
+ * is a flag for one of the participants. The bytes are assigned as follows:
+ *
+ *    msb                                                             lsb
+ *   +-------+-------+-------+-------+-------+-------+-------+-------+
+ *   | SPE 0 | SPE 1 | SPE 2 | SPE 3 | SPE 4 | SPE 5 | SPE 6 | SPE 7 |
+ *   +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Parameters:
+ *   incomplete    Pointer to the asynchronous completion variable.
+ *
+ *   participants  Number of participants in the command that need to
+ *                 acknowledge completion status.
+ */
+static inline void init_incomplete(unsigned long long *incomplete, int participants)
+{
+  if (incomplete) {
+    *incomplete = 0xFFFFFFFFFFFFFFFFULL << (8*(8-participants));
+  }
+}
+
+
+/* send_cmd_to_spes
+ * ----------------
+ * Send the command, along with the index of its parameter buffer, to the
+ * specified number of SPE participants. The command and index are combined
+ * into a 32-bit message that is placed in the inbound SPE mailbox. The 7
+ * least significant bits of the message contain the command id. The 25 most
+ * significant bits are an offset from cmd_base to the cacheline containing
+ * the command parameters.
+ *
+ * Parameters:
+ *   cmd           Command id (one of HPL_ACCEL_CMD_*).
+ *
+ *   idx           Command buffer index that contains the parameters for
+ *                 this command.
+ *
+ *   participants  Number of participants that will participate in the command.
+ */
+
+static inline void send_cmd_to_spes(unsigned int cmd, int idx, int participants)
+{
+  int i;
+  int cnt;
+  volatile spe_spu_control_area_t *ctl;
+
+  /* Construct cmd message to be sent to each of the SPEs via the
+   * inbound mailbox.
+   */
+  cmd |= (unsigned int)(idx * sizeof(struct hpl_accel_cmd_entry));
+
+  for (i=0; i<participants; i++) {
+    ctl = hpl_accel_threads[i].ctl_area;
+
+    /* Wait until the SPE's inbound mailbox has a free entry.
+     */
+    cnt = hpl_accel_threads[i].in_cnt;
+    while (cnt == 0) {
+      cnt = (ctl->SPU_Mbox_Stat >> 8) & 0xFF;
+    }
+
+    /* Place the command into the inbound mailbox.
+     */
+    ctl->SPU_In_Mbox = cmd;
+    hpl_accel_threads[i].in_cnt = cnt-1;
+  }
+}
+#endif
+
+
+#ifdef PANEL_4GB_CROSSING
+#define COMPUTE_PANEL_4GB_CROSSING_COUNT(_count, _panel, _ld, _max) {	\
+  int _cnt;								\
+  /* Calculate the number of rows/columns to the 4GB crossing and clamp \
+   * the result to max.							\
+   */									\
+  _cnt = (0x20000000 - ((unsigned int)(uintptr_t)_panel) / sizeof(double)) / _ld; \
+  _count = (_cnt > _max) ? _max : _cnt;					\
+}
+#else /* !PANEL_4GB_CROSSING */
+#define COMPUTE_PANEL_4GB_CROSSING_COUNT(_count, _panel, _ld, _max)
+#endif /* PANEL_4GB_CROSSING */
+
+#define COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld)				\
+  /* return the number of 4GB crossings in panel _p */			\
+  (((uintptr_t)(_p + _n*_ld - 1) >> 32) - ((uintptr_t)_p >> 32))
+
+#ifdef VALIDATE_4GB_CROSSING
+#ifdef PANEL_4GB_CROSSING
+
+#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld) {			\
+  /* Verify that if the panel crosses a 4GB boundary, it does so only on a row \
+   * boundary, and only once.
\ + */ \ + if (_p) { \ + unsigned int _crossings; \ + unsigned int _bytes_til_crossing; \ + \ + _crossings = COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld); \ + switch (_crossings) { \ + case 0: \ + break; \ + case 1: \ + _bytes_til_crossing = ((uintptr_t)_p ^ (-1)) + 1; \ + if ((_bytes_til_crossing % (_ld * sizeof(double))) != 0) { \ + fprintf(stderr, "%s %d - Panel crosses 4GB boundary within a row/col. Parameters p=%p n=%d ld=%d\n",\ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + break; \ + default: \ + fprintf(stderr, "%s %d - Panel crosses %d 4GB boundary. Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _crossings, _p, _n, _ld); \ + abort(); \ + break; \ + } \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Panel leading dimension too big. Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + } \ +} + +#else /* ! PANEL_4GB_CROSSING */ + +#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld) { \ + /* Verify that the panel does not cross a 4GB boundary */ \ + if (_p) { \ + if ( COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld) != 0 ) { \ + fprintf(stderr, "%s %d - Panel crosses 4GB boundary unexpectedly. Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Panel leading dimension too big. Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + } \ +} +#endif + +#ifdef MATRIX_4GB_CROSSING + +#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld) { \ + if (_p) { \ + int _i; \ + double *_start, *_end; \ + unsigned int _blks_per_col, _dbls_to_crossing; \ + \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Matrix leading dimension too big. Parameters p=%p m=%d n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld); \ + abort(); \ + } \ + /* For each column of blocks */ \ + _blks_per_col = (_m + (M_SUB-1))/M_SUB; \ + for (_i=0, _start=(double *)_p; _i<_n; _i+=M_SUB) { \ + _end = _start + _ld; \ + if (((uintptr_t)(_end) >> 32) > ((uintptr_t)(_start) >> 32)) { \ + /* This column crosses a 4GB boundary. Check to see that it occurs only on a block boundary */ \ + _dbls_to_crossing = 0x20000000 - ((unsigned int)(uintptr_t)_start) / sizeof(double); \ + if (((M_SUB*M_SUB)*_blks_per_col > _dbls_to_crossing) && \ + ((_dbls_to_crossing % (M_SUB*M_SUB)) != 0)) { \ + fprintf(stderr, "%s %d - Matrix block straddles 4GB boundary. Parameters p=%p m=%d n=%d ld=%d\n",\ + __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld); \ + abort(); \ + } \ + } \ + _start = _end; \ + } \ + } \ +} + +#else /* !MATRIX_4GB_CROSSING */ + +#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld) { \ + if (_p) { \ + if ((((uintptr_t)_p) >> 32) != ((uintptr_t)(_p + _ld * (((_n+M_SUB-1)/M_SUB)-1) + ((_m+M_SUB-1)/M_SUB)*M_SUB*M_SUB-1) >> 32)) { \ + fprintf(stderr, "%s %d - Matrix crosses 4GB boundary unexpectedly. Parameters p=%p m=%d n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld); \ + abort(); \ + } \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Matrix leading dimension too big. 
Parameters p=%p m=%d n=%d ld=%d\n",		\
+	      __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld);		\
+      abort();								\
+    }									\
+  }									\
+}
+#endif
+
+#else /* VALIDATE_4GB_CROSSING */
+#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld)
+#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld)
+#endif /* VALIDATE_4GB_CROSSING */
+
+#endif /* _HPL_ACCEL_SPU_H_ */
Index: accel/lib/hpl_accel_swap.c
===================================================================
RCS file: accel/lib/hpl_accel_swap.c
diff -N accel/lib/hpl_accel_swap.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/hpl_accel_swap.c	20 Aug 2008 03:57:53 -0000	1.4
@@ -0,0 +1,150 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <stdio.h>		/* fprintf/abort used by the VALIDATE_* macros */
+#include <stdlib.h>
+#include "hpl_accel.h"
+#include "hpl_accel_spu.h"
+#include "ppu_intrinsics.h"
+
+/* General purpose, reference, reformatting facilities.
+ */
+
+void hpl_accel_swap_rows_B_to_B(int m, int n,
+                                double *a, int lda,
+                                int *blk_rows, int blk_col,
+                                unsigned long long *incomplete)
+{
+  int i;
+  unsigned int non_aligned;
+  int n0 = 0;
+
+  a += (blk_col/M_SUB) * lda;
+  blk_col %= M_SUB;
+
+  non_aligned = (((unsigned int)(blk_col | lda) & 1) |
+                 (((unsigned int)(uintptr_t)a) & (16-1)));
+
+  if ((non_aligned == 0) && (n > 1)) {
+    int m_start, m_left;
+    int rows_per_block;
+    unsigned int idx;
+    volatile hpl_accel_swap_rows_parms_t *parms;
+
+    /* Assert that we won't span a 4G boundary crossing
+     */
+    VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda);
+
+    n0 = n & ~1;
+
+    idx = hpl_accel_cmd_idx;
+
+    m_start = 0;
+    m_left = m;
+
+    /* Generate multiple command requests if the number of rows
+     * is greater than what will fit in a single command request.
+     */
+    rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int));
+
+    while (m_left > rows_per_block) {
+      parms = (volatile hpl_accel_swap_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+      parms->m = rows_per_block;
+      parms->n = n0;
+      parms->lda = lda * sizeof(double);
+      parms->blk_col = blk_col;
+
+      parms->a = a + INDEX_BLK(m_start,0,lda);
+      parms->incomplete = NULL;
+
+      for (i=0; i<rows_per_block; i++) parms->blk_rows[i] = blk_rows[m_start+i]-m_start;
+
+      /* Perform a sync in order to ensure that the parameters are written
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+
+      /* Send the command to each of the SPEs.
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B, idx, HPL_ACCEL_SPES);
+
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+
+      m_start += rows_per_block;
+      m_left  -= rows_per_block;
+    }
+
+    if (m_left > 0) {
+      parms = (volatile hpl_accel_swap_rows_parms_t *)(&hpl_accel_cmd_queue[idx]);
+
+      parms->m = m_left;
+      parms->n = n0;
+      parms->lda = lda * sizeof(double);
+      parms->blk_col = blk_col;
+
+      parms->a = a + INDEX_BLK(m_start,0,lda);
+      parms->incomplete = incomplete;
+
+      for (i=0; i<m_left; i++) parms->blk_rows[i] = blk_rows[m_start+i]-m_start;
+
+      init_incomplete(incomplete, HPL_ACCEL_SPES);
+
+      /* Perform a sync in order to ensure that the parameters are written
+       * to memory before writing to the mailbox command queue.
+       */
+      __sync();
+
+      /* Send the command to each of the SPEs.
+       */
+      send_cmd_to_spes(HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B, idx, HPL_ACCEL_SPES);
+
+      idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES;
+    } else {
+      if (incomplete) *incomplete = 0;
+    }
+    hpl_accel_cmd_idx = idx;
+  } else {
+    if (incomplete) *incomplete = 0;
+  }
+
+  /* Cleanup portions of the rows not handled by the SPEs above.
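+   * When the SPE path ran, only an odd trailing column remains
+   * (n0 = n & ~1); otherwise n0 == 0 and the entire swap is performed
+   * here, element by element, stepping from block to block across
+   * each row.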
+ */ + if (n0 < n) { + unsigned int y1, y2, x; + int first_span, span, left; + double tmp, *src, *dst; + + blk_col += n0; + n -= n0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + first_span = M_SUB - blk_col; + if (first_span > n) first_span = n; + + /* For each of the rows */ + for (y1=0; y1<(unsigned int)m; y1++) { + y2 = blk_rows[y1]; /* New location for row y1 */ + if (y1 != y2) { + dst = a + (y1 * M_SUB); + src = a + (y2 * M_SUB); + for (x=0; x<(unsigned int)first_span; x++) + {tmp = dst[x+blk_col]; dst[x+blk_col] = src[x+blk_col]; src[x+blk_col] = tmp;} + left = n - first_span; + while (left) { + dst += lda; + src += lda; + span = (left > M_SUB) ? M_SUB : left; + for (x=0; x<(unsigned int)span; x++) + {tmp = dst[x]; dst[x] = src[x]; src[x] = tmp;} + left -= span; + } + } + } + } +} + Index: accel/lib/hpl_ref.c =================================================================== RCS file: accel/lib/hpl_ref.c diff -N accel/lib/hpl_ref.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_ref.c 20 Aug 2008 03:57:53 -0000 1.11 @@ -0,0 +1,419 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" + +#include + + +int hpl_ref_init() +{ + return HPL_ACCEL_INIT_SUCCESS; +} + + +void hpl_ref_dgemm_CL_R_B(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y,x,ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)]; + } + } + } + + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_dgemm_CL_R_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *p, int ldp, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + if (p) { + /* Copy c into p */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] = c[INDEX_BLK(y+blk_row, x+blk_col, ldc)]; + } + } + /* Perform DGEMM on p */ + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] -= a_val * b[INDEX_ROW(i,x,ldb)]; + } + } + } + /* Byte swap the result */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(p[INDEX_COL(y,x,ldp)]); + } + } + } else { + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y+blk_row, x+blk_col, ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)]; + } + } + } + } + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_dgemm_CL_B_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *p, int ldp, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + if (p) { + /* Copy c into p */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) 
{ + p[INDEX_COL(y,x,ldp)] = c[INDEX_BLK(y+blk_row, x+blk_col, ldc)]; + } + } + /* Perform DGEMM on P */ + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)]; + } + } + } + /* Byte swap the result */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(p[INDEX_COL(y,x,ldp)]); + } + } + } else { + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y+blk_row, x+blk_col, ldc)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)]; + } + } + } + } + if (incomplete) *incomplete = 0; +} + + + +void hpl_ref_dgemm_CL_B_B(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y,x,ldc)] -= a_val * b[INDEX_BLK(i,x,ldb)]; + } + } + } + + if (incomplete) *incomplete = 0; +} + + +extern void hpl_ref_dgemm_C_C_C(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val, c_val; + + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = a[INDEX_COL(y,i,lda)]; + for (x=0; x<(unsigned int)n; x++) { + c_val = c[INDEX_COL(y,x,ldc)]; + c_val -= a_val * b[INDEX_COL(i,x,ldb)]; + c[INDEX_COL(y,x,ldc)] = c_val; + } + } + } + + if (incomplete) *incomplete = 0; +} + + + +void hpl_ref_dtrsm_CL_R(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned long long *incomplete) +{ + unsigned int i, x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + b[INDEX_ROW(i, x, ldb)] -= b[INDEX_ROW(y-1, x, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_dtrsm_CL_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete) +{ + unsigned int i, x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + b[INDEX_BLK(i+blk_row, x+blk_col, ldb)] -= b[INDEX_BLK(y-1+blk_row, x+blk_col, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_dtrsm_CL_R_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete) +{ + unsigned int i, x, y; + + if (c) { + for (x=0; x<(unsigned int)n; x++) { + + for (i=0; i<(unsigned int)m; i++) c[INDEX_BLK(i+blk_row, x+blk_col, ldc)] = b[INDEX_ROW(i, x, ldb)]; /* Copy the column of b into c */ + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + c[INDEX_BLK(i+blk_row, x+blk_col, ldc)] -= c[INDEX_BLK(y-1+blk_row, x+blk_col, ldc)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + } else { + for (x=0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { 
+ for (i=y; i<(unsigned int)m; i++) { + b[INDEX_ROW(i, x, ldb)] -= b[INDEX_ROW(y-1, x, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + } + if (incomplete) *incomplete = 0; +} + + + + +/* General purpose, reference, reformating facilities. + */ +void hpl_ref_reform_panel_CL_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=0; y<(unsigned int)m; y++) { + a[INDEX_BLK(y,x,lda)] = hpl_accel_byte_swap(panel[INDEX_COL(y,x,ldp)]); + } + } + + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_reform_matrix_CL_to_B(int m, int n, + double *a, int lda, + double *scratch, + int size __attribute__ ((unused)) , + unsigned long long *incomplete) + +{ + unsigned int i; + unsigned int x, y; + unsigned int col; + + /* Reformat the matrix [a] from column-order, little-endian to blocked, + * big-endian format. + */ + + /* For each column of blocks */ + for (col=0; col<(unsigned int)n; col+=M_SUB) { + /* Reformat the column of block into the scratch buffer */ + for (x=0; x<(unsigned int)M_SUB; x++) { + for (y=0; y<(unsigned int)m; y++) { + scratch[INDEX_ROW(y,x,M_SUB)] = hpl_accel_byte_swap(a[INDEX_COL(y,x,lda)]); + } + } + /* Copy the reformated data back into a */ + memcpy(a, scratch, sizeof(double)*M_SUB*m); + + /* Zero the trailing block column of data */ + a += M_SUB*m; + for (i=0; i<(unsigned int)M_SUB*(lda-m); i++) *a++ = 0.0; + } + if (incomplete) *incomplete = 0; +} + + + + +void hpl_ref_reform_panel_B_to_CL(int m, int n, + double *panel, int ldp, + double *a, int lda, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=0; y<(unsigned int)m; y++) { + panel[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(a[INDEX_BLK(y,x,lda)]); + } + } + if (incomplete) *incomplete = 0; +} + + + +void hpl_ref_reform_panel_R_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=0; y<(unsigned int)m; y++) { + a[INDEX_BLK(y,x,lda)] = panel[INDEX_ROW(y,x,ldp)]; + } + } + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_reform_rows_R_to_B(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + a[INDEX_BLK((unsigned int)blk_rows[y], blk_col+x, lda)] = rows[INDEX_ROW(y, x, ldr)]; + } + } + if (incomplete) *incomplete = 0; +} + + + +void hpl_ref_reform_rows_B_to_R(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) + +{ + unsigned int x, y; + + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + rows[INDEX_ROW(y, x, ldr)] = a[INDEX_BLK((unsigned int)blk_rows[y], blk_col+x, lda)]; + } + } + if (incomplete) *incomplete = 0; +} + +void hpl_ref_swap_rows_B_to_B(int m, int n, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + unsigned int y1, y2, x; + + for (y1=0; y1<(unsigned int)m; y1++) { + y2 = blk_rows[y1]; /* New location for row y1 */ + if (y1 != y2) { + /* Swap rows y1 and y2 */ + for (x=0; x<(unsigned int)n; x++) { + double tmp = a[INDEX_BLK(y1, x+blk_col, lda)]; + a[INDEX_BLK(y1, x+blk_col, lda)] = a[INDEX_BLK(y2, x+blk_col, lda)]; + a[INDEX_BLK(y2, x+blk_col, lda)] = tmp; + } + } + } + if 
(incomplete) *incomplete = 0; +} + +void hpl_ref_copy_rows_R_to_R(int m, int n, + double *a, int lda, + double *b, int ldb, + int *rows, + unsigned long long *incomplete) +{ + unsigned int y1, y2, x; + + for (y1=0; y1<(unsigned int)m; y1++) { + y2 = rows[y1]; /* New location for row y1 */ + /* Copy row a[y1] to b[y2] */ + for (x=0; x<(unsigned int)n; x++) { + b[INDEX_ROW(y2, x, ldb)] = a[INDEX_ROW(y1, x, lda)]; + } + } + if (incomplete) *incomplete = 0; +} Index: accel/lib/spu/Makefile =================================================================== RCS file: accel/lib/spu/Makefile diff -N accel/lib/spu/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/Makefile 20 Aug 2008 03:57:53 -0000 1.9 @@ -0,0 +1,57 @@ +# --------------------------------------------------------------- +# (C) Copyright IBM Corporation 2007,2008 +# +# --------------------------------------------------------------- + +######################################################################## +# Target +######################################################################## + +PROGRAM_spu := hpl_accel_spu + +LIBRARY_embed64 = libhpl_accel_spu.a + +OBJS = hpl_accel_spu.o \ + accel_dgemm.o \ + accel_dgemm_panel.o \ + accel_dgemm_C.o \ + accel_dtrsm.o \ + accel_dtrsm_panel.o \ + accel_dtrsm_CL_B.o \ + accel_reform_matrix_CL_to_B.o \ + accel_reform_panel_B_to_CL.o \ + accel_reform_panel_R_to_B.o \ + accel_reform_rows_B_to_R.o \ + accel_reform_rows_R_to_B.o \ + accel_swap_rows_B_to_B.o \ + accel_copy_rows_R_to_R.o \ + accel_buffers.o \ + accel_mm_dp_64Cx64.o \ + accel_dtrsm_dp_128Cx16.o \ + accel_mm_dp.o + + +######################################################################## +# Local Defines +######################################################################## + +# CC_OPT_LEVEL = -g + +#CPPFLAGS = -DACCEL_LITTLE_ENDIAN +CPPFLAGS += -DMATRIX_4GB_CROSSING + +# THE SPU CODE DOES NOT YET SUPPORT 4GB PANEL CROSSING +#CPPFLAGS += -DPANEL_4GB_CROSSING + +CFLAGS_gcc = -march=celledp -mtune=celledp +CFLAGS_xlc = -qarch=edp -qtune=edp + +INCLUDE = -I.. 
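+
+# hpl_accel_spu.h and the other headers shared with the PPU code live
+# one directory up, hence -I..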
+ + + +######################################################################## +# make.footer +######################################################################## + +include $(CELL_TOP)/buildutils/make.footer Index: accel/lib/spu/accel_buffers.S =================================================================== RCS file: accel/lib/spu/accel_buffers.S diff -N accel/lib/spu/accel_buffers.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_buffers.S 23 Oct 2008 21:20:24 -0000 1.3 @@ -0,0 +1,24 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + + .data + .align 7 + .global bufA +bufA: + .global bufA_128x128 +bufA_128x128: + .skip 2*64*64*8 + .global bufB +bufB: .skip 2*64*64*8 + + .global bufC +bufC: + .global bufB_128x16 +bufB_128x16: + .skip 2*128*16*8 + .global bufB_list +bufB_list: + .skip 64*64*8 + Index: accel/lib/spu/accel_buffers.h =================================================================== RCS file: accel/lib/spu/accel_buffers.h diff -N accel/lib/spu/accel_buffers.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_buffers.h 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,24 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _ACCEL_BUFFERS_H_ +#define _ACCEL_BUFFERS_H_ + +#include + +/* The local store buffers is carved up uniquely for each acceleration function. + */ + +/* DGEMM buffer set */ +extern vec_double2 bufA[2][64*64/2]; +extern vec_double2 bufB[2][64*64/2]; +extern vec_double2 bufC[2][64*64/2]; + +/* DTRSM buffer set */ +extern vec_double2 bufA_128x128[128*128/2]; +extern vec_double2 bufB_128x16[2][128*16/2]; +extern vec_uint4 bufB_list[8][128/2]; + +#endif /* _ACCEL_BUFFERS_H_ */ Index: accel/lib/spu/accel_copy_rows_R_to_R.c =================================================================== RCS file: accel/lib/spu/accel_copy_rows_R_to_R.c diff -N accel/lib/spu/accel_copy_rows_R_to_R.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_copy_rows_R_to_R.c 20 Aug 2008 03:57:53 -0000 1.4 @@ -0,0 +1,127 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + + +static inline void row_R_to_R(unsigned int src_hi, unsigned int src_lo, + unsigned int dst_hi, unsigned int dst_lo, + unsigned int left, unsigned int *tag) +{ + void *buf[2]; + unsigned int size; + + buf[0] = bufA; + buf[1] = bufB; + + size = 16*1024; + if (size > left) size = left; + + spu_mfcdma64(buf[*tag], src_hi, src_lo, size, *tag, MFC_GETB_CMD); + left -= size; + + while (left) { + + spu_mfcdma64(buf[*tag], dst_hi, dst_lo, size, *tag, MFC_PUTB_CMD); + + *tag ^= 1; + + /* increment src_hi, src_lo, dst_hi, dst_lo */ + MATRIX_EA_UADD32(src_hi, src_lo, size); + MATRIX_EA_UADD32(dst_hi, dst_lo, size); + + size = 16*1024; + if (size > left) size = left; + + spu_mfcdma64(buf[*tag], src_hi, src_lo, size, *tag, MFC_GETB_CMD); + left -= size; + } + + spu_mfcdma64(buf[*tag], dst_hi, dst_lo, size, *tag, MFC_PUTB_CMD); +} + + +void accel_copy_rows_R_to_R(hpl_accel_init_parms_t 
*parms, + volatile hpl_accel_copy_rows_parms_t *cmd_parms) +{ + int m, n, lda, ldb; + unsigned int src, dst; + unsigned int id; + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int src_hi, src_lo; + unsigned int dst_hi, dst_lo; + unsigned int row_size; + unsigned int tag; + unsigned int rows_per_spe, extra_rows, start_row, end_row; + vector signed int m_n_lda_ldb; + vector unsigned long long a_b, incomplete_pad; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + m_n_lda_ldb = cmd_parms->m_n_lda_ldb; + a_b = cmd_parms->a_b; + incomplete_pad = cmd_parms->incomplete_pad; + + m = spu_extract(m_n_lda_ldb, 0); + n = spu_extract(m_n_lda_ldb, 1); + lda = spu_extract(m_n_lda_ldb, 2); + ldb = spu_extract(m_n_lda_ldb, 3); + + a_hi = spu_extract((vector unsigned int)a_b, 0); + a_lo = spu_extract((vector unsigned int)a_b, 1); + + b_hi = spu_extract((vector unsigned int)a_b, 2); + b_lo = spu_extract((vector unsigned int)a_b, 3); + + /* Process rows by assigning each row to one SPE. + */ + row_size = n*sizeof(double); + rows_per_spe = m / HPL_ACCEL_SPES; + extra_rows = m % HPL_ACCEL_SPES; + + start_row = id * rows_per_spe + ((id > extra_rows) ? extra_rows : id); + end_row = start_row + rows_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_rows, 0), spu_promote(id, 0)), 0); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + tag = 0; + for (src=start_row; srcrows[src]; + + src_hi = a_hi; src_lo = a_lo; + MATRIX_EA_UADD32(src_hi, src_lo, src*lda); + + dst_hi = b_hi; dst_lo = b_lo; + MATRIX_EA_UADD32(dst_hi, dst_lo, dst*ldb); + + row_R_to_R(src_hi, src_lo, dst_hi, dst_lo, row_size, &tag); + + tag ^= 1; + } + + DMA_WAIT(1<id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + + /* Fetch the command parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + c = cmd_parms->c; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + c_hi = mfc_ea2h(c); + c_lo = mfc_ea2l(c); + + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + ldb = spu_extract(ld, 1); + + dim = cmd_parms->dim; + + flags = cmd_parms->flags; + + b_blk = spu_maskw(spu_extract(flags, 0)); + + /* Computation of [C] -= [A][B] is performed in a surpetine pattern + * through the various sub-blocks of C. Below is a graphical attempt + * to explain the partitioning and order of the computation. For this + * example, consider the matrix-matrix multiply of a 5x5 (128x128 block) + * result after panel factorization of block 0,0 (bx,by). In this case, + * we must compute 128x128 blocks multiplies as follows: + * + * for (x=1; x<5; x++) { + * for (y=1; y<5; y++) { + * C(x,y) -= A(bx,y)*B(x,by); + * } + * } + * + * Assuming this computation is performed by 3 SPEs, the 16 blocks + * are subdivided as: + * + * SPE 0 : C(1,1), C(1,2), C(1,3), C(1,4), C(2,1), C(2,2) + * SPE 1 : C(2,3), C(2,4), C(3,1), C(3,2), C(3,3), C(3,4) + * SPE 2 : C(4,1), C(4,2), C(4,3), C(4,4) + * + * Therefore, SPE 1 will compute the resulting sub-blocks of C in the + * alphabetic order (a thru z) as marked below. 
+ * + * X + * 0 1 2 3 4 + * +---B---+---+---+---+ + * 0 | | U row | + * | | | + * A---C---+---+---+---+ + * 1 | | | |i x| | + * | | | |j w| | + * + L +---+---+---+---+ + * Y 2 | | | |k v| | + * | p | | |l u| | + * + a +---+---+---+---+ + * 3 | n | |a h|m t| | + * | e | |b g|n s| | + * + l +---+---+---+---+ + * 4 | | |c f|o r| | + * | | |d e|p q| | + * +---+---+---+---+---+ + * + * Using 128x128 block partitioning amongst the SPEs results non-optimal + * load balancing of the SPEs. This is shown by the above example in which + * SPEs 0 and 1 compute 24 64x64 multiplies, while SPE 2 only computes + * 16 64x64 multiplies. In addition, the corner turn between sub-blocks + * 'h' and 'i' will incur extra DMAs. + * + * A more computational and transfer efficient load balance would be + * to allocate computation on the 64 sub-blocks. This would allocate + * 22,22,20 sub-block multiplies to each of the SPEs and the corner + * turn becomes efficient. The sub-block, computation (alphabetically + * ordered) for SPE 1 becomes: + * + * X + * 0 1 2 3 4 + * +---B---+---+---+---+ + * 0 | | U row | + * | | | + * A---C---+---+---+---+ + * 1 | | | j|k | | + * | | | i|l | | + * + L +---+---+---+---+ + * Y 2 | | | h|m | | + * | p | | g|n | | + * + a +---+---+---+---+ + * 3 | n | | f|o v| | + * | e | | e|p u| | + * + l +---+---+---+---+ + * 4 | | |a d|q t| | + * | | |b c|r s| | + * +---+---+---+---+---+ + * + * This more efficient method is employed in the following code. + */ + + w_sub = spu_extract(dim, 0); + h_sub_v = spu_shuffle(dim, dim, splat_1); + h_sub = spu_extract(h_sub_v, 0); + + h_sub2_v = spu_sl(h_sub_v, 1); + + sub_blocks = w_sub * h_sub; + sub_blocks_per_spe = (sub_blocks + HPL_ACCEL_SPES-1) / HPL_ACCEL_SPES; + + start_sub = ((unsigned short)id) * sub_blocks_per_spe; + end_sub = start_sub + sub_blocks_per_spe; + if (end_sub > sub_blocks) end_sub = sub_blocks; + + sub_blocks = end_sub - start_sub; + + if (LIKELY((int)sub_blocks > 0)) { + /* This SPE has some work to do + */ + DMA_WAIT_REQUEST(-1); + + /* Compute vectors for stepping the effective address matrix pointers. + * The pictograms below show 64x64 blocks within the 128x128 blocks. + * + * A (L panel) B (U panel) C matrix + * ++===+===++ ++---+---++---+---++ ++===+===++===+===++ + * || 1 | 2 || || 1 | 4 || 5 | || || 1 | || | || + * ++---+---++ ++---+---++---+---++ ++---+---++---+---++ + * || 3 | || || 2 | 3 || | || || 2 | || | || + * ++===+===++ ++---+---++---+---++ ++===+===++===+===++ + * || | || || | || | || + * ++---+---++ ++---+---++---+---++ + * || | || || 3 | 4 || | || + * ++===+===++ ++===+===++===+===++ + * + * a_step = {1 to 2, 2 to 3, 1 to 2, 2 to 3} + * b_step = {1 to 2, 2 to 3, 3 to 4, 4 to 5} + * c_stepv= {1 to 2, 1 to 2, 1 to 2, 1 to 2} + * c_steph= {3 to 4, 3 to 4, 3 to 4, 3 to 4} + */ + + a_step = spu_promote(lda * M_SUB, 0); + a_step = spu_shuffle(a_step, spu_sub(step_sub, a_step), shuf_0404); + + c_stepv = spu_splats(M_SUB*M_SUB*sizeof(double)); + c_steph = spu_shuffle(ld, ld, splat_2); + + b_step = spu_sel(spu_promote(ldb * M_SUB, 0), c_stepv, b_blk); + b_step = spu_shuffle(b_step, spu_sub(0, b_step), shuf_0044); + b_step = spu_sel(b_step, spu_sel(step_sub, spu_shuffle(ld, ld, splat_1), b_blk), mask_0101); + + ldb = spu_extract(spu_sel(spu_promote(ldb, 0), step_sub, b_blk), 0); + + /* Determine the following: + * 1) Starting sub-block - x_sub, y_sub + * 2) Number of sub-block multiplies before a corner turn - corner. 
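+ *
+ * Conceptually, the index arithmetic below implements this scalar
+ * sketch (illustrative only -- visit() is a hypothetical stand-in for
+ * the two 64x64 multiplies performed per C sub-block):
+ *
+ *   for (s=start_sub; s<end_sub; s++) {
+ *     x = s / h_sub;
+ *     y = s % h_sub;
+ *     if (x & 1) y = h_sub - 1 - y;    -- odd columns walk bottom-up
+ *     visit(x, y);
+ *   }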
+ */ + x_sub = start_sub / h_sub; + y_sub = start_sub - h_sub * x_sub; + + start_x = x_sub / SUB; + y_sub = start_sub - h_sub*SUB*start_x; + + /* rotate = 4; + * + * if (x_sub & 1) { + * y_sub = h_sub - 1 - y_sub; + * a_step = spu_sub(0, a_step); + * c_stepv = spu_sub(0, c_stepv); + * rotate = -rotate; + * corner = 2*y_sub + 2 + * } else { + * corner = 2 * (h_sub-y_sub) + * } + */ + odd = x_sub & 1; + + down = spu_cmpeq(spu_splats(odd), 0); + + y_sub = spu_extract(spu_sel(spu_sub(h_sub2_v, spu_promote(y_sub + 1, 0)), + spu_promote(y_sub, 0), + down), 0); + + y_sub2_v = spu_splats(2*y_sub); + + corner = spu_sel(spu_add(y_sub2_v, 2), spu_sub(h_sub2_v, y_sub2_v), down); + + /* Compute the initial EA buffer pointers. + */ + a_addend = y_sub * spu_extract(step_sub, 0) + spu_extract(spu_andc(a_step, down), 0); + b_addend = spu_extract(spu_andc(b_step, down), 0); + c_addend = y_sub * spu_extract(c_stepv, 0); + + a_lo += a_addend; + + MATRIX_EA_UADD32(b_hi, b_lo, b_addend); + MATRIX_EA_UMADD32(b_hi, b_lo, x_sub, spu_extract(b_step, 1)); + MATRIX_EA_UADD32(c_hi, c_lo, c_addend); + MATRIX_EA_UMADD32(c_hi, c_lo, x_sub, spu_extract(c_steph, 0)); + + /* Adjust the pointer steps according to the initial direction. + */ + a_step = spu_sel(spu_sub(0, a_step), a_step, down); + b_step = spu_rlqwbyte(b_step, 8 & ~spu_extract(down, 0)); + c_stepv = spu_sel(spu_sub(0, c_stepv), c_stepv, down); + rotate = ((-4) ^ spu_extract(down, 0)) - spu_extract(down, 0); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + /* Download 3 blocks to get the process started. After that, each + * 64x64 block multiple requires 2 block transfers. + */ + dma_block_getl(&bufA[0][0], a_hi, a_lo, 0, lda); + + dma_block_getl(&bufB[0][0], b_hi, b_lo, 0, ldb); + + dma_block(&bufC[0][0], c_hi, c_lo, 0, MFC_GET_CMD); + + DMA_WAIT_REQUEST(1<<0); + + c_lo_prev = c_lo; + + a_lo += spu_extract(a_step, 0); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(b_step, 0)); + + dma_block_getl(&bufA[1][0], a_hi, a_lo, 1, lda); + + dma_block_getl(&bufB[1][0], b_hi, b_lo, 1, ldb); + + phase = 0; + + i1 = 0; + a_idx = 0; + + /* For each C block, we perform 2 block computations + */ + for (i=0; i<(int)sub_blocks-1; i++) { + /* First block computation + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST((1<<1)|(1<<2)); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]); + + a_step = spu_rlqwbyte(a_step, rotate); + + c_idx = i1 ^ 1; + + corner_eq2 = spu_cmpeq(corner, 2); + + /* if (corner == 2) { + * rotate = -rotate; + * a_step = 0-a_step; + * } else { + * a_lo += a_step; + * } + */ + rotate = (rotate ^ spu_extract(corner_eq2, 0)) - spu_extract(corner_eq2, 0); + a_lo += spu_extract(spu_andc(a_step, corner_eq2), 0); + a_step = spu_sel(a_step, spu_sub(0, a_step), corner_eq2); + + /* if corner != 2 then fetch next A buffer + * else "corner turn" fetch next B buffer + */ + b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0)); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0)); + + idx = spu_extract(spu_andc(spu_promote(a_idx, 0), corner_eq2), 0); + buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0), + spu_promote((unsigned int)bufB, 0), + corner_eq2), 0); + hi = spu_extract(spu_sel(spu_promote(a_hi, 0), + spu_promote(b_hi, 0), + corner_eq2), 0); + lo = spu_extract(spu_sel(spu_promote(a_lo, 0), + spu_promote(b_lo, 0), + corner_eq2), 0); + stride = spu_extract(spu_sel(spu_promote(lda, 0), + spu_promote(ldb, 0), + 
corner_eq2), 0); + + buf += idx * (unsigned int)(sizeof(bufA)/2); + +#ifdef __GNUC__ + /* The following lnop was added to keep gcc from unscheduling the + * series of add,stqd instruction pairs used to build the DMA list in + * dma_block_getl. + */ + si_lnop(); +#endif + + dma_block_getl((vec_double2 *)buf, hi, lo, 0, stride); + + /* if (corner == 2) { + * c_lo += c_steph; + * c_stepv = -c_stepv; + * } else { + * c_lo += c_stepv; + * } + */ +#ifdef MATRIX_4GB_CROSSING + c_hi_prev = c_hi; +#endif + c_lo_prev = c_lo; + c_addend = spu_extract(spu_sel(c_stepv, c_steph, corner_eq2), 0); + MATRIX_EA_ADD32(c_hi, c_lo, c_addend); + c_stepv = spu_sel(c_stepv, spu_sub(0, c_stepv), corner_eq2); + + /* Before getting another C buffer, we must wait for the previous + * one to be stored. + */ + DMA_WAIT_RECEIVE(); + dma_block(&bufC[c_idx][0], c_hi, c_lo, 0, MFC_GET_CMD); + + DMA_WAIT_REQUEST(1<<0); + + a_idx = phase^1; + + /* Second block computation + */ + c_ptr = &bufC[i1][0]; + + mm_dp_64Cx64(c_ptr, &bufA[a_idx][0], &bufB[1][0]); + + a_step = spu_rlqwbyte(a_step, rotate); + a_lo += spu_extract(a_step, 0); + + /* if corner != 2 then fetch next A buffer + * else "corner turn" fetch next B buffer + */ + b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0)); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0)); + + idx = spu_extract(spu_sel(spu_promote(a_idx, 0), vone, corner_eq2), 0); + buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0), + spu_promote((unsigned int)bufB, 0), + corner_eq2), 0); + hi = spu_extract(spu_sel(spu_promote(a_hi, 0), + spu_promote(b_hi, 0), + corner_eq2), 0); + lo = spu_extract(spu_sel(spu_promote(a_lo, 0), + spu_promote(b_lo, 0), + corner_eq2), 0); + stride = spu_extract(spu_sel(spu_promote(lda, 0), + spu_promote(ldb, 0), + corner_eq2), 0); + + buf += idx * (unsigned int)(sizeof(bufA)/2); + dma_block_getl((vec_double2 *)buf, hi, lo, 1, stride); + +#ifdef MATRIX_4GB_CROSSING + dma_block(c_ptr, c_hi_prev, c_lo_prev, 2, MFC_PUT_CMD); +#else + dma_block(c_ptr, c_hi, c_lo_prev, 2, MFC_PUT_CMD); +#endif + + corner = spu_sel(spu_add(corner, -2), h_sub2_v, corner_eq2); + phase ^= spu_extract(corner_eq2, 0) & 1; + + i1 ^= 1; + a_idx = phase; + } + + /* Finish the last sub-block */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST((1<<1)|(1<<2)); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]); + + DMA_WAIT_RECEIVE(); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx^1][0], &bufB[1][0]); + + dma_block(&bufC[i1][0], c_hi, c_lo, 1, MFC_PUT_CMD); + } + + /* Report completion status if requested. 
+ */ + report_completion(id, cmd_parms->incomplete, 1); +} + + + Index: accel/lib/spu/accel_dgemm.h =================================================================== RCS file: accel/lib/spu/accel_dgemm.h diff -N accel/lib/spu/accel_dgemm.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dgemm.h 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,164 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _ACCEL_DGEMM_H_ +#define _ACCEL_DGEMM_H_ 1 + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" + +extern hpl_accel_init_parms_t parms; + +extern void mm_dp_64Cx64(vec_double2 *blkC, vec_double2 *blkA, vec_double2 *blkB); + +static inline void dma_block(vec_double2 *ls, unsigned int hi, unsigned int lo, unsigned int tag, unsigned int cmd) +{ + spu_mfcdma64(ls, hi, lo, 16384, tag, cmd); + spu_mfcdma64(ls+(16384/16), hi, lo+16384, 16384, tag, cmd); +} + +static inline void dma_block_getl(vec_double2 *ls, unsigned int hi, unsigned int lo, unsigned int tag, unsigned int stride) +{ + vec_uint4 *list; + vec_uint4 e0, e1, e2; + vec_uint4 stride2, stride4, stride6; + + /* Place the list at the end of the target LS buffer. + */ + list = (vec_uint4 *)ls + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4)); + + /* Construct e0, e1, e2 and stride6 to contain + * + * e0 = {row size, lo+0*stride, row size, lo+1*stride} + * e1 = {row size, lo+2*stride, row size, lo+3*stride} + * e2 = {row size, lo+4*stride, row size, lo+5*stride} + * + * stride6 = {0, 6*stride, 0, 6*stride} + */ + + e0 = spu_add(spu_shuffle(spu_splats((unsigned int)(M_SUB*sizeof(double))), + spu_promote(lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + + stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + e1 = spu_add(e0, stride2); + + e2 = spu_add(e0, stride4); list[0] = e0; + e0 = spu_add(e0, stride6); list[1] = e1; + e1 = spu_add(e1, stride6); list[2] = e2; + e2 = spu_add(e2, stride6); list[3] = e0; + e0 = spu_add(e0, stride6); list[4] = e1; + e1 = spu_add(e1, stride6); list[5] = e2; + e2 = spu_add(e2, stride6); list[6] = e0; + e0 = spu_add(e0, stride6); list[7] = e1; + e1 = spu_add(e1, stride6); list[8] = e2; + e2 = spu_add(e2, stride6); list[9] = e0; + e0 = spu_add(e0, stride6); list[10] = e1; + e1 = spu_add(e1, stride6); list[11] = e2; + e2 = spu_add(e2, stride6); list[12] = e0; + e0 = spu_add(e0, stride6); list[13] = e1; + e1 = spu_add(e1, stride6); list[14] = e2; + e2 = spu_add(e2, stride6); list[15] = e0; + e0 = spu_add(e0, stride6); list[16] = e1; + e1 = spu_add(e1, stride6); list[17] = e2; + e2 = spu_add(e2, stride6); list[18] = e0; + e0 = spu_add(e0, stride6); list[19] = e1; + e1 = spu_add(e1, stride6); list[20] = e2; + e2 = spu_add(e2, stride6); list[21] = e0; + e0 = spu_add(e0, stride6); list[22] = e1; + e1 = spu_add(e1, stride6); list[23] = e2; + e2 = spu_add(e2, stride6); list[24] = e0; + e0 = spu_add(e0, stride6); list[25] = e1; + e1 = spu_add(e1, stride6); list[26] = e2; + e2 = spu_add(e2, stride6); list[27] = e0; + e0 = spu_add(e0, stride6); list[28] = e1; + e1 = spu_add(e1, stride6); list[29] = e2; + list[30] = e0; + 
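+  /* (The e0/e1/e2 values above form three independent add/store chains,
+   * each advancing by 6*stride per turn, so the even-pipe adds and the
+   * odd-pipe stores can dual-issue; the final two list entries need no
+   * further address advance.)
+   */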
list[31] = e1; + + /* Initiate the DMA transfer + */ + spu_mfcdma64(ls, hi, (unsigned int)list, 8*M_SUB, tag, MFC_GETL_CMD); +} + +static inline void dma_block_putl(vec_uint4 *list, vec_double2 *ls, unsigned int hi, unsigned int lo, unsigned int tag, unsigned int stride) +{ + vec_uint4 e0, e1, e2; + vec_uint4 stride2, stride4, stride6; + + /* Construct e0, e1, e2 and stride6 to contain + * + * e0 = {row size, lo+0*stride, row size, lo+1*stride} + * e1 = {row size, lo+2*stride, row size, lo+3*stride} + * e2 = {row size, lo+4*stride, row size, lo+5*stride} + * + * stride6 = {0, 6*stride, 0, 6*stride} + */ + + e0 = spu_add(spu_shuffle(spu_splats((unsigned int)(M_SUB*sizeof(double))), + spu_promote(lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + + stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + e1 = spu_add(e0, stride2); + + e2 = spu_add(e0, stride4); list[0] = e0; + e0 = spu_add(e0, stride6); list[1] = e1; + e1 = spu_add(e1, stride6); list[2] = e2; + e2 = spu_add(e2, stride6); list[3] = e0; + e0 = spu_add(e0, stride6); list[4] = e1; + e1 = spu_add(e1, stride6); list[5] = e2; + e2 = spu_add(e2, stride6); list[6] = e0; + e0 = spu_add(e0, stride6); list[7] = e1; + e1 = spu_add(e1, stride6); list[8] = e2; + e2 = spu_add(e2, stride6); list[9] = e0; + e0 = spu_add(e0, stride6); list[10] = e1; + e1 = spu_add(e1, stride6); list[11] = e2; + e2 = spu_add(e2, stride6); list[12] = e0; + e0 = spu_add(e0, stride6); list[13] = e1; + e1 = spu_add(e1, stride6); list[14] = e2; + e2 = spu_add(e2, stride6); list[15] = e0; + e0 = spu_add(e0, stride6); list[16] = e1; + e1 = spu_add(e1, stride6); list[17] = e2; + e2 = spu_add(e2, stride6); list[18] = e0; + e0 = spu_add(e0, stride6); list[19] = e1; + e1 = spu_add(e1, stride6); list[20] = e2; + e2 = spu_add(e2, stride6); list[21] = e0; + e0 = spu_add(e0, stride6); list[22] = e1; + e1 = spu_add(e1, stride6); list[23] = e2; + e2 = spu_add(e2, stride6); list[24] = e0; + e0 = spu_add(e0, stride6); list[25] = e1; + e1 = spu_add(e1, stride6); list[26] = e2; + e2 = spu_add(e2, stride6); list[27] = e0; + e0 = spu_add(e0, stride6); list[28] = e1; + e1 = spu_add(e1, stride6); list[29] = e2; + list[30] = e0; + list[31] = e1; + + /* Initiate the DMA transfer + */ + spu_mfcdma64(ls, hi, (unsigned int)list, 8*M_SUB, tag, MFC_PUTL_CMD); +} + + + +#endif /* _ACCEL_DGEMM_H_ */ Index: accel/lib/spu/accel_dgemm_C.c =================================================================== RCS file: accel/lib/spu/accel_dgemm_C.c diff -N accel/lib/spu/accel_dgemm_C.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dgemm_C.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,229 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" + + +extern void mm_dp(int k, int m, int n, vector double *c, vector double *a, vector double *b); + + +/* Construct a DMA list assuming that there are 64 columns. If it is less, then they don't get used. 
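+ *
+ * The unrolled construction below is equivalent to this rolled sketch,
+ * where lel[] views the list as 64 8-byte MFC list elements of the
+ * form {transfer size, low 32 EA bits} (names are illustrative):
+ *
+ *   for (i=0; i<64; i++) {
+ *     lel[i].size   = elementsize;
+ *     lel[i].ea_low = lo + i*stride;
+ *   }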
+ */ +static inline void construct_list(vec_uint4 *list, unsigned int lo, unsigned int stride, unsigned int elementsize) +{ + vec_uint4 e0, e1, e2; + vec_uint4 stride2, stride4, stride6; + + /* Construct e0, e1, e2 and stride6 to contain + * + * e0 = {row size, lo+0*stride, row size, lo+1*stride} + * e1 = {row size, lo+2*stride, row size, lo+3*stride} + * e2 = {row size, lo+4*stride, row size, lo+5*stride} + * + * stride6 = {0, 6*stride, 0, 6*stride} + */ + + e0 = spu_add(spu_shuffle(spu_promote(elementsize, 0), spu_promote(lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + + stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + e1 = spu_add(e0, stride2); + + e2 = spu_add(e0, stride4); list[0] = e0; + e0 = spu_add(e0, stride6); list[1] = e1; + e1 = spu_add(e1, stride6); list[2] = e2; + e2 = spu_add(e2, stride6); list[3] = e0; + e0 = spu_add(e0, stride6); list[4] = e1; + e1 = spu_add(e1, stride6); list[5] = e2; + e2 = spu_add(e2, stride6); list[6] = e0; + e0 = spu_add(e0, stride6); list[7] = e1; + e1 = spu_add(e1, stride6); list[8] = e2; + e2 = spu_add(e2, stride6); list[9] = e0; + e0 = spu_add(e0, stride6); list[10] = e1; + e1 = spu_add(e1, stride6); list[11] = e2; + e2 = spu_add(e2, stride6); list[12] = e0; + e0 = spu_add(e0, stride6); list[13] = e1; + e1 = spu_add(e1, stride6); list[14] = e2; + e2 = spu_add(e2, stride6); list[15] = e0; + e0 = spu_add(e0, stride6); list[16] = e1; + e1 = spu_add(e1, stride6); list[17] = e2; + e2 = spu_add(e2, stride6); list[18] = e0; + e0 = spu_add(e0, stride6); list[19] = e1; + e1 = spu_add(e1, stride6); list[20] = e2; + e2 = spu_add(e2, stride6); list[21] = e0; + e0 = spu_add(e0, stride6); list[22] = e1; + e1 = spu_add(e1, stride6); list[23] = e2; + e2 = spu_add(e2, stride6); list[24] = e0; + e0 = spu_add(e0, stride6); list[25] = e1; + e1 = spu_add(e1, stride6); list[26] = e2; + e2 = spu_add(e2, stride6); list[27] = e0; + e0 = spu_add(e0, stride6); list[28] = e1; + e1 = spu_add(e1, stride6); list[29] = e2; + list[30] = e0; + list[31] = e1; +} + + + +/* Double precision DGEMM matrix-matrix multiply for column-ordered + * matrices. 
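+ *
+ * In BLAS terms the update below corresponds roughly to
+ *
+ *   dgemm('N', 'N', m, n, k, -1.0, A, lda, B, ldb, 1.0, C, ldc)
+ *
+ * i.e. [C] -= [A]*[B] as in the panel kernel's update, with the rows
+ * of C partitioned across the SPEs (an interpretation of the code
+ * below; the inner kernel is mm_dp above).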
+ */ +void accel_dgemm_C_C_C(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dgemm_parms_t *cmd_parms) +{ + int rows, next_rows; + unsigned int id, i, k, m, m_start, m_next, n; + unsigned int elementsize, idx, tag; + unsigned int blks, blks_per_spe, extra_blks; + unsigned long long a, b, c; /* ea pointers */ + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int c_hi, c_lo; + unsigned int lda, ldb, ldc; + vec_uint4 ld, dim, *list, *c_list, *c_list_next; + vec_double2 *A, *B, *C; + void *ptrB; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the command parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + c = cmd_parms->c; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + c_hi = mfc_ea2h(c); + c_lo = mfc_ea2l(c); + + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + ldb = spu_extract(ld, 1); + ldc = spu_extract(ld, 2); + + dim = cmd_parms->dim; + + n = spu_extract(dim, 0); + m = spu_extract(dim, 1); + k = spu_extract(dim, 2); + + /* Get a copy of B + */ + B = (void *)&bufB[0][0]; + ptrB = B; + DMA_WAIT_RECEIVE(); + for (i=0; i m) rows = m - m_start; + + a_lo += m_start * sizeof(double); + c_lo += m_start * sizeof(double); + + /* Fetch a block of A and C + */ + m = (rows > M_SUB) ? M_SUB : rows; + + elementsize = m * sizeof(double); + A = (void *)&bufA[0][0]; + list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4)); + construct_list(list, a_lo, lda, elementsize); + spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, 0, MFC_GETL_CMD); + + c_list = (vec_uint4 *)&bufB[1][0]; + construct_list(c_list, c_lo, ldc, elementsize); + spu_mfcdma64((vec_double2 *)&bufC[0][0], c_hi, (unsigned int)c_list, 8*n, 0, MFC_GETL_CMD); + + DMA_WAIT_REQUEST(1); + + tag = 1; + idx = 1; + next_rows = rows - M_SUB; + + while (next_rows > 0) { + /* Fetch the next block of A and C */ + a_lo += elementsize; + c_lo += elementsize; + + m_next = (next_rows > M_SUB) ? M_SUB : next_rows; + elementsize = m_next * sizeof(double); + A = (void *)&bufA[tag][0]; + list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4)); + construct_list(list, a_lo, lda, elementsize); + spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, tag, MFC_GETLB_CMD); + + c_list_next = (vec_uint4 *)&bufB[1][idx*M_SUB]; + construct_list(c_list_next, c_lo, ldc, elementsize); + spu_mfcdma64((vec_double2 *)&bufC[tag][0], c_hi, (unsigned int)c_list_next, 8*n, tag, MFC_GETL_CMD); + + /* Compute a block */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(1<incomplete, tag); +} Index: accel/lib/spu/accel_dgemm_CL.c =================================================================== RCS file: accel/lib/spu/accel_dgemm_CL.c diff -N accel/lib/spu/accel_dgemm_CL.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dgemm_CL.c 14 May 2008 21:35:00 -0000 1.6 @@ -0,0 +1,231 @@ +/* -------------------------------------------------------------- */ +/* (C)Copyright 2007 */ +/* International Business Machines Corporation, */ +/* */ +/* All Rights Reserved. */ +/* -------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" + + +extern void mm_dp(int k, int m, int n, vector double *c, vector double *a, vector double *b); + + +/* Construct a DMA list assuming that there are 64 columns. If it is less, then they don't get used. 
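+ *
+ * (All 32 vector entries -- 64 list elements -- are always written;
+ * the spu_mfcdma64 calls that consume the list pass 8*k or 8*n as the
+ * list size, so any trailing entries are simply never issued.)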
+ */ +static inline void construct_list(vec_uint4 *list, unsigned int lo, unsigned int stride, unsigned int elementsize) +{ + vec_uint4 e0, e1, e2; + vec_uint4 stride2, stride4, stride6; + + /* Construct e0, e1, e2 and stride6 to contain + * + * e0 = {row size, lo+0*stride, row size, lo+1*stride} + * e1 = {row size, lo+2*stride, row size, lo+3*stride} + * e2 = {row size, lo+4*stride, row size, lo+5*stride} + * + * stride6 = {0, 6*stride, 0, 6*stride} + */ + + e0 = spu_add(spu_shuffle(spu_promote(elementsize, 0), spu_promote(lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + + stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + e1 = spu_add(e0, stride2); + + e2 = spu_add(e0, stride4); list[0] = e0; + e0 = spu_add(e0, stride6); list[1] = e1; + e1 = spu_add(e1, stride6); list[2] = e2; + e2 = spu_add(e2, stride6); list[3] = e0; + e0 = spu_add(e0, stride6); list[4] = e1; + e1 = spu_add(e1, stride6); list[5] = e2; + e2 = spu_add(e2, stride6); list[6] = e0; + e0 = spu_add(e0, stride6); list[7] = e1; + e1 = spu_add(e1, stride6); list[8] = e2; + e2 = spu_add(e2, stride6); list[9] = e0; + e0 = spu_add(e0, stride6); list[10] = e1; + e1 = spu_add(e1, stride6); list[11] = e2; + e2 = spu_add(e2, stride6); list[12] = e0; + e0 = spu_add(e0, stride6); list[13] = e1; + e1 = spu_add(e1, stride6); list[14] = e2; + e2 = spu_add(e2, stride6); list[15] = e0; + e0 = spu_add(e0, stride6); list[16] = e1; + e1 = spu_add(e1, stride6); list[17] = e2; + e2 = spu_add(e2, stride6); list[18] = e0; + e0 = spu_add(e0, stride6); list[19] = e1; + e1 = spu_add(e1, stride6); list[20] = e2; + e2 = spu_add(e2, stride6); list[21] = e0; + e0 = spu_add(e0, stride6); list[22] = e1; + e1 = spu_add(e1, stride6); list[23] = e2; + e2 = spu_add(e2, stride6); list[24] = e0; + e0 = spu_add(e0, stride6); list[25] = e1; + e1 = spu_add(e1, stride6); list[26] = e2; + e2 = spu_add(e2, stride6); list[27] = e0; + e0 = spu_add(e0, stride6); list[28] = e1; + e1 = spu_add(e1, stride6); list[29] = e2; + list[30] = e0; + list[31] = e1; +} + + + +/* Double precision DGEMM matrix-matrix multiply for column-ordered + * matrices. 
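+ *
+ * The M_SUB-row blocks of A and C are double buffered on tags 0/1; in
+ * outline (a sketch of the loop below, not literal code):
+ *
+ *   getl A(0), C(0) on tag 0
+ *   for each remaining row block i {
+ *     getlb A(i+1), getl C(i+1) on the other tag
+ *     wait tag i; compute block i against the resident copy of B
+ *     putl C(i) on tag i
+ *   }
+ *   wait; compute and putl the final block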
+ */ +void accel_dgemm_CL_C_C(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dgemm_parms_t *cmd_parms) +{ + int rows, next_rows; + unsigned int id, i, k, m, m_start, m_next, n; + unsigned int elementsize, idx, tag; + unsigned int blks, blks_per_spe, extra_blks; + unsigned long long a, b, c; /* ea pointers */ + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int c_hi, c_lo; + unsigned int lda, ldb, ldc; + vec_uint4 ld, dim, *list, *c_list, *c_list_next; + vec_double2 *A, *B, *C; + void *ptrB; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the command parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + c = cmd_parms->c; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + c_hi = mfc_ea2h(c); + c_lo = mfc_ea2l(c); + + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + ldb = spu_extract(ld, 1); + ldc = spu_extract(ld, 2); + + dim = cmd_parms->dim; + + n = spu_extract(dim, 0); + m = spu_extract(dim, 1); + k = spu_extract(dim, 2); + + /* Get a copy of B + */ + B = (void *)&bufB[0][0]; + ptrB = B; + DMA_WAIT_RECEIVE(); + for (i=0; i m) rows = m - m_start; + + a_lo += m_start * sizeof(double); + c_lo += m_start * sizeof(double); + + /* Fetch a block of A and C + */ + m = (rows > M_SUB) ? M_SUB : rows; + + elementsize = m * sizeof(double); + A = (void *)&bufA[0][0]; + list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4)); + construct_list(list, a_lo, lda, elementsize); + spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, 0, MFC_GETL_CMD); + + c_list = (vec_uint4 *)&bufB[1][0]; + construct_list(c_list, c_lo, ldc, elementsize); + spu_mfcdma64((vec_double2 *)&bufC[0][0], c_hi, (unsigned int)c_list, 8*n, 0, MFC_GETL_CMD); + + DMA_WAIT_REQUEST(1); + + tag = 1; + idx = 1; + next_rows = rows - M_SUB; + + while (next_rows > 0) { + /* Fetch the next block of A and C */ + a_lo += elementsize; + c_lo += elementsize; + + m_next = (next_rows > M_SUB) ? M_SUB : next_rows; + elementsize = m_next * sizeof(double); + A = (void *)&bufA[tag][0]; + list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4)); + construct_list(list, a_lo, lda, elementsize); + spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, tag, MFC_GETLB_CMD); + + c_list_next = (vec_uint4 *)&bufB[1][idx*M_SUB]; + construct_list(c_list_next, c_lo, ldc, elementsize); + spu_mfcdma64((vec_double2 *)&bufC[tag][0], c_hi, (unsigned int)c_list_next, 8*n, tag, MFC_GETL_CMD); + + /* Compute a block */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(1<incomplete, tag); +} Index: accel/lib/spu/accel_dgemm_panel.c =================================================================== RCS file: accel/lib/spu/accel_dgemm_panel.c diff -N accel/lib/spu/accel_dgemm_panel.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dgemm_panel.c 23 Oct 2008 21:20:24 -0000 1.5 @@ -0,0 +1,585 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "accel_dgemm.h" + +/* transpose_and_swap + * ------------------ + * For a 64x64 matrix m, inplace transpose the matrix and byte swap the contents. 
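+ *
+ * Scalar equivalent (a sketch; M(i,j) is a hypothetical scalar view of
+ * the 64x64 block, and the byte swap applies when ACCEL_LITTLE_ENDIAN
+ * is defined):
+ *
+ *   for (i=0; i<64; i++)
+ *     for (j=0; j<=i; j++) {
+ *       t = bswap64(M(i,j));
+ *       M(i,j) = bswap64(M(j,i));
+ *       M(j,i) = t;
+ *     }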
+ */
+
+static void transpose_and_swap(vec_double2 m[])
+{
+  int i, j;
+  vec_double2 *row, *col;
+#ifdef ACCEL_LITTLE_ENDIAN
+  vec_uchar16 pat_even = (vec_uchar16){7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
+#else
+  vec_uchar16 pat_even = (vec_uchar16){0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23};
+#endif
+  vec_uchar16 pat_odd;
+  vec_double2 r00, r01, r10, r11, r20, r21, r30, r31;
+  vec_double2 c00, c01, c10, c11, c20, c21, c30, c31;
+
+  pat_odd = spu_or(pat_even, 8);
+
+  /* Perform transpose and swap on 4x4 micro blocks
+   */
+  for (i=0; i<64; i+=4) {
+    /* Transpose and swap the micro block on the diagonal. For example, consider
+     * the 16x16 matrix consisting of the following 16 micro blocks. The following
+     * code transposes the micro block along the diagonal, as marked by the "X".
+     *
+     *   +---+---+---+---+
+     *   | X |   |   |   |
+     *   +---+---+---+---+
+     *   |   | X |   |   |
+     *   +---+---+---+---+
+     *   |   |   | X |   |
+     *   +---+---+---+---+
+     *   |   |   |   | X |
+     *   +---+---+---+---+
+     */
+    r00 = m[0*32+0];
+    r01 = m[0*32+1];
+    r10 = m[1*32+0];
+    r11 = m[1*32+1];
+    r20 = m[2*32+0];
+    r21 = m[2*32+1];
+    r30 = m[3*32+0];
+    r31 = m[3*32+1];
+
+    m[0*32+0] = spu_shuffle(r00, r10, pat_even);
+    m[0*32+1] = spu_shuffle(r20, r30, pat_even);
+    m[1*32+0] = spu_shuffle(r00, r10, pat_odd);
+    m[1*32+1] = spu_shuffle(r20, r30, pat_odd);
+    m[2*32+0] = spu_shuffle(r01, r11, pat_even);
+    m[2*32+1] = spu_shuffle(r21, r31, pat_even);
+    m[3*32+0] = spu_shuffle(r01, r11, pat_odd);
+    m[3*32+1] = spu_shuffle(r21, r31, pat_odd);
+
+    row = m + 2;
+    col = m + 4*32;
+
+    for (j=i+4; j<64; j+=4) {
+      /* Transpose and swap the micro blocks across the diagonal. For example, consider
+       * the 16x16 matrix consisting of the following 16 micro blocks. For each row
+       * of micro blocks, the row blocks to the right of the diagonal are transposed
+       * and swapped with the column blocks below the diagonal. In our example, in the
+       * first row, block A is transposed and swapped with column block 'a'. Likewise for
+       * 'B' and 'b'; and 'C' and 'c'.
+ * + * +---+---+---+---+ + * | | A | B | C | + * +---+---+---+---+ + * | a | | D | E | + * +---+---+---+---+ + * | b | d | | F | + * +---+---+---+---+ + * | c | e | f | | + * +---+---+---+---+ + */ + r00 = row[0*32+0]; + r01 = row[0*32+1]; + r10 = row[1*32+0]; + r11 = row[1*32+1]; + r20 = row[2*32+0]; + r21 = row[2*32+1]; + r30 = row[3*32+0]; + r31 = row[3*32+1]; + + c00 = col[0*32+0]; + c01 = col[0*32+1]; + c10 = col[1*32+0]; + c11 = col[1*32+1]; + c20 = col[2*32+0]; + c21 = col[2*32+1]; + c30 = col[3*32+0]; + c31 = col[3*32+1]; + + row[0*32+0] = spu_shuffle(c00, c10, pat_even); + row[0*32+1] = spu_shuffle(c20, c30, pat_even); + row[1*32+0] = spu_shuffle(c00, c10, pat_odd); + row[1*32+1] = spu_shuffle(c20, c30, pat_odd); + + col[0*32+0] = spu_shuffle(r00, r10, pat_even); + col[0*32+1] = spu_shuffle(r20, r30, pat_even); + col[1*32+0] = spu_shuffle(r00, r10, pat_odd); + col[1*32+1] = spu_shuffle(r20, r30, pat_odd); + + row[2*32+0] = spu_shuffle(c01, c11, pat_even); + row[2*32+1] = spu_shuffle(c21, c31, pat_even); + row[3*32+0] = spu_shuffle(c01, c11, pat_odd); + row[3*32+1] = spu_shuffle(c21, c31, pat_odd); + + col[2*32+0] = spu_shuffle(r01, r11, pat_even); + col[2*32+1] = spu_shuffle(r21, r31, pat_even); + col[3*32+0] = spu_shuffle(r01, r11, pat_odd); + col[3*32+1] = spu_shuffle(r21, r31, pat_odd); + + row += 2; /* Advance pointer to next row micro block */ + col += 4*32; /* Advance pointer to next column micro block */ + } + + m += 4*32+2; + } +} + +void accel_dgemm_panel(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dgemm_parms_t *cmd_parms) +{ + int i; + int rotate; + unsigned int id; + unsigned int idx, a_idx, c_idx; + unsigned int i1, phase; + unsigned long long a, b, c, p; /* ea pointers */ + unsigned int hi, lo; + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int c_hi, c_lo; + unsigned int p_hi, p_lo; + unsigned int sub_blocks, sub_blocks_per_spe; + unsigned int start_x, start_sub, end_sub; + unsigned int odd, buf; + unsigned int x_sub, y_sub; + unsigned int w_sub, h_sub; /* width & height in sub_blocks */ + unsigned int lda, ldb, ldp, stride; + unsigned int a_addend, b_addend, c_addend, p_addend; + vec_uint4 vone = (vec_uint4){1, 1, 1, 1}; + vec_uint4 ld, flags, b_blk; + vec_uint4 a_step, b_step, c_stepv, c_steph, p_stepv, p_steph; + vec_uint4 dim; + vec_uint4 h_sub_v, h_sub2_v, y_sub2_v; + vec_uint4 down, corner; + vec_uint4 corner_eq2; + vec_uint4 step_sub = spu_splats(M_SUB*sizeof(double)); + vec_uint4 mask_0101 = (vec_uint4){0,-1,0,-1}; + vec_uint4 list[2][M_SUB/2]; + vec_uchar16 splat_1 = (vec_uchar16)spu_splats((unsigned int)0x04050607); + vec_uchar16 splat_2 = (vec_uchar16)spu_splats((unsigned int)0x08090A0B); + vec_uchar16 shuf_0404 = (vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19}; + vec_uchar16 shuf_0044 = (vec_uchar16){0,1,2,3, 0,1,2,3, 16,17,18,19, 16,17,18,19}; + vec_double2 *c_ptr; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + + /* Fetch the command parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + c = cmd_parms->c; + p = cmd_parms->p; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + c_hi = mfc_ea2h(c); + c_lo = mfc_ea2l(c); + + p_hi = mfc_ea2h(p); + p_lo = mfc_ea2l(p); + + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + ldb = spu_extract(ld, 1); + ldp = spu_extract(ld, 3); + + dim = cmd_parms->dim; + + flags = cmd_parms->flags; + + b_blk = spu_maskw(spu_extract(flags, 0)); + + /* Computation of [C] -= [A][B] is 
performed in a surpetine pattern + * through the various sub-blocks of C. Below is a graphical attempt + * to explain the partitioning and order of the computation. For this + * example, consider the matrix-matrix multiply of a 5x5 (128x128 block) + * result after panel factorization of block 0,0 (bx,by). In this case, + * we must compute 128x128 blocks multiplies as follows: + * + * for (x=1; x<5; x++) { + * for (y=1; y<5; y++) { + * C(x,y) -= A(bx,y)*B(x,by); + * } + * } + * + * Assuming this computation is performed by 3 SPEs, the 16 blocks + * are subdivided as: + * + * SPE 0 : C(1,1), C(1,2), C(1,3), C(1,4), C(2,1), C(2,2) + * SPE 1 : C(2,3), C(2,4), C(3,1), C(3,2), C(3,3), C(3,4) + * SPE 2 : C(4,1), C(4,2), C(4,3), C(4,4) + * + * Therefore, SPE 1 will compute the resulting sub-blocks of C in the + * alphabetic order (a thru z) as marked below. + * + * X + * 0 1 2 3 4 + * +---B---+---+---+---+ + * 0 | | U row | + * | | | + * A---C---+---+---+---+ + * 1 | | | |i x| | + * | | | |j w| | + * + L +---+---+---+---+ + * Y 2 | | | |k v| | + * | p | | |l u| | + * + a +---+---+---+---+ + * 3 | n | |a h|m t| | + * | e | |b g|n s| | + * + l +---+---+---+---+ + * 4 | | |c f|o r| | + * | | |d e|p q| | + * +---+---+---+---+---+ + * + * Using 128x128 block partitioning amongst the SPEs results non-optimal + * load balancing of the SPEs. This is shown by the above example in which + * SPEs 0 and 1 compute 24 64x64 multiplies, while SPE 2 only computes + * 16 64x64 multiplies. In addition, the corner turn between sub-blocks + * 'h' and 'i' will incur extra DMAs. + * + * A more computational and transfer efficient load balance would be + * to allocate computation on the 64 sub-blocks. This would allocate + * 22,22,20 sub-block multiplies to each of the SPEs and the corner + * turn becomes efficient. The sub-block, computation (alphabetically + * ordered) for SPE 1 becomes: + * + * X + * 0 1 2 3 4 + * +---B---+---+---+---+ + * 0 | | U row | + * | | | + * A---C---+---+---+---+ + * 1 | | | j|k | | + * | | | i|l | | + * + L +---+---+---+---+ + * Y 2 | | | h|m | | + * | p | | g|n | | + * + a +---+---+---+---+ + * 3 | n | | f|o v| | + * | e | | e|p u| | + * + l +---+---+---+---+ + * 4 | | |a d|q t| | + * | | |b c|r s| | + * +---+---+---+---+---+ + * + * This more efficient method is employed in the following code. + */ + + w_sub = spu_extract(dim, 0); + h_sub_v = spu_shuffle(dim, dim, splat_1); + h_sub = spu_extract(h_sub_v, 0); + + h_sub2_v = spu_sl(h_sub_v, 1); + + sub_blocks = w_sub * h_sub; + sub_blocks_per_spe = (sub_blocks + HPL_ACCEL_SPES-1) / HPL_ACCEL_SPES; + + start_sub = ((unsigned short)id) * sub_blocks_per_spe; + end_sub = start_sub + sub_blocks_per_spe; + if (end_sub > sub_blocks) end_sub = sub_blocks; + + sub_blocks = end_sub - start_sub; + + if (LIKELY((int)sub_blocks > 0)) { + /* This SPE has some work to do + */ + DMA_WAIT_REQUEST(-1); + + /* Compute vectors for stepping the effective address matrix pointers. + * The pictograms below show 64x64 blocks within the 128x128 blocks. 
+ * + * A (L panel) B (U panel) C matrix + * ++===+===++ ++---+---++---+---++ ++===+===++===+===++ + * || 1 | 2 || || 1 | 4 || 5 | || || 1 | || | || + * ++---+---++ ++---+---++---+---++ ++---+---++---+---++ + * || 3 | || || 2 | 3 || | || || 2 | || | || + * ++===+===++ ++---+---++---+---++ ++===+===++===+===++ + * || | || || | || | || + * ++---+---++ ++---+---++---+---++ + * || | || || 3 | 4 || | || + * ++===+===++ ++===+===++===+===++ + * + * + * P (output matrix) + * ++===+===++===+===++ + * || 1 | || | || + * ++---+---++---+---++ + * || 2 | || | || + * ++===+===++===+===++ + * || | || | || + * ++---+---++---+---++ + * || 3 | 4 || | || + * ++===+===++===+===++ + * a_step = {1 to 2, 2 to 3, 1 to 2, 2 to 3} + * b_step = {1 to 2, 2 to 3, 3 to 4, 4 to 5} + * c_stepv= {1 to 2, 1 to 2, 1 to 2, 1 to 2} + * c_steph= {3 to 4, 3 to 4, 3 to 4, 3 to 4} + * p_stepv= {1 to 2, 1 to 2, 1 to 2, 1 to 2} + * p_steph= {3 to 4, 3 to 4, 3 to 4, 3 to 4} + */ + + a_step = spu_promote(lda * M_SUB, 0); + a_step = spu_shuffle(a_step, spu_sub(step_sub, a_step), shuf_0404); + + c_stepv = spu_splats(M_SUB*M_SUB*sizeof(double)); + c_steph = spu_shuffle(ld, ld, splat_2); + + p_stepv = step_sub; + p_steph = spu_promote(ldp * M_SUB, 0); + + b_step = spu_sel(spu_promote(ldb * M_SUB, 0), c_stepv, b_blk); + b_step = spu_shuffle(b_step, spu_sub(0, b_step), shuf_0044); + b_step = spu_sel(b_step, spu_sel(step_sub, spu_shuffle(ld, ld, splat_1), b_blk), mask_0101); + + ldb = spu_extract(spu_sel(spu_promote(ldb, 0), step_sub, b_blk), 0); + + /* Determine the following: + * 1) Starting sub-block - x_sub, y_sub + * 2) Number of sub-block multiplies before a corner turn - corner. + */ + x_sub = start_sub / h_sub; + y_sub = start_sub - h_sub * x_sub; + + start_x = x_sub / SUB; + y_sub = start_sub - h_sub*SUB*start_x; + + /* rotate = 4; + * + * if (x_sub & 1) { + * y_sub = h_sub - 1 - y_sub; + * a_step = spu_sub(0, a_step); + * c_stepv = spu_sub(0, c_stepv); + * p_stepv = spu_sub(0, p_stepv); + * rotate = -rotate; + * corner = 2*y_sub + 2 + * } else { + * corner = 2 * (h_sub-y_sub) + * } + */ + odd = x_sub & 1; + + down = spu_cmpeq(spu_splats(odd), 0); + + y_sub = spu_extract(spu_sel(spu_sub(h_sub2_v, spu_promote(y_sub + 1, 0)), + spu_promote(y_sub, 0), + down), 0); + + y_sub2_v = spu_splats(2*y_sub); + + corner = spu_sel(spu_add(y_sub2_v, 2), spu_sub(h_sub2_v, y_sub2_v), down); + + /* Compute the initial EA buffer pointers. + */ + a_addend = y_sub * spu_extract(step_sub, 0) + spu_extract(spu_andc(a_step, down), 0); + b_addend = spu_extract(spu_andc(b_step, down), 0); + c_addend = y_sub * spu_extract(c_stepv, 0); + p_addend = y_sub * spu_extract(p_stepv, 0) + x_sub * spu_extract(p_steph, 0); + + a_lo += a_addend; + p_lo += p_addend; + MATRIX_EA_UADD32(b_hi, b_lo, b_addend); + MATRIX_EA_UMADD32(b_hi, b_lo, x_sub, spu_extract(b_step, 1)); + MATRIX_EA_UADD32(c_hi, c_lo, c_addend); + MATRIX_EA_UMADD32(c_hi, c_lo, x_sub, spu_extract(c_steph, 0)); + + /* Adjust the pointer steps according to the initial direction. + */ + a_step = spu_sel(spu_sub(0, a_step), a_step, down); + b_step = spu_rlqwbyte(b_step, 8 & ~spu_extract(down, 0)); + c_stepv = spu_sel(spu_sub(0, c_stepv), c_stepv, down); + p_stepv = spu_sel(spu_sub(0, p_stepv), p_stepv, down); + rotate = ((-4) ^ spu_extract(down, 0)) - spu_extract(down, 0); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + /* Download 3 blocks to get the process started. 
After that, each + * 64x64 block multiple requires 2 block transfers. + */ + dma_block_getl(&bufA[0][0], a_hi, a_lo, 0, lda); + + dma_block_getl(&bufB[0][0], b_hi, b_lo, 0, ldb); + + dma_block(&bufC[0][0], c_hi, c_lo, 0, MFC_GET_CMD); + + DMA_WAIT_REQUEST(1<<0); + + a_lo += spu_extract(a_step, 0); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(b_step, 0)); + + dma_block_getl(&bufA[1][0], a_hi, a_lo, 1, lda); + + dma_block_getl(&bufB[1][0], b_hi, b_lo, 1, ldb); + + phase = 0; + + i1 = 0; + a_idx = 0; + + for (i=0; i<(int)sub_blocks-1; i++) { + /* First block computation + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST((1<<1)|(1<<2)); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]); + + a_step = spu_rlqwbyte(a_step, rotate); + + c_idx = i1 ^ 1; + + corner_eq2 = spu_cmpeq(corner, 2); + + /* if (corner == 2) { + * rotate = -rotate; + * a_step = 0-a_step; + * } else { + * a_lo += a_step; + * } + */ + rotate = (rotate ^ spu_extract(corner_eq2, 0)) - spu_extract(corner_eq2, 0); + a_lo += spu_extract(spu_andc(a_step, corner_eq2), 0); + a_step = spu_sel(a_step, spu_sub(0, a_step), corner_eq2); + + /* if corner != 2 then fetch next A buffer + * else "corner turn" fetch next B buffer + */ + b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0)); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0)); + + idx = spu_extract(spu_andc(spu_promote(a_idx, 0), corner_eq2), 0); + buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0), + spu_promote((unsigned int)bufB, 0), + corner_eq2), 0); + hi = spu_extract(spu_sel(spu_promote(a_hi, 0), + spu_promote(b_hi, 0), + corner_eq2), 0); + lo = spu_extract(spu_sel(spu_promote(a_lo, 0), + spu_promote(b_lo, 0), + corner_eq2), 0); + stride = spu_extract(spu_sel(spu_promote(lda, 0), + spu_promote(ldb, 0), + corner_eq2), 0); + + buf += idx * (unsigned int)(sizeof(bufA)/2); + +#ifdef __GNUC__ + /* The following lnop was added to keep gcc from unscheduling the + * series of add,stqd instruction pairs used to build the DMA list in + * dma_block_getl. + */ + si_lnop(); +#endif + + dma_block_getl((vec_double2 *)buf, hi, lo, 0, stride); + + /* if (corner == 2) { + * c_lo += c_steph; + * c_stepv = -c_stepv; + * } else { + * c_lo += c_stepv; + * } + */ + c_addend = spu_extract(spu_sel(c_stepv, c_steph, corner_eq2), 0); + MATRIX_EA_ADD32(c_hi, c_lo, c_addend); + c_stepv = spu_sel(c_stepv, spu_sub(0, c_stepv), corner_eq2); + + /* Before getting another C buffer, we must wait for the previous + * one to be stored. 
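+ * (Only two C buffers exist in the LS, so the GET that refills a
+ * buffer must not overtake the PUT that drains it; the
+ * DMA_WAIT_RECEIVE below provides that ordering.)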
+ */ + DMA_WAIT_RECEIVE(); + dma_block(&bufC[c_idx][0], c_hi, c_lo, 0, MFC_GET_CMD); + + DMA_WAIT_REQUEST(1<<0); + + a_idx = phase^1; + + /* Second block computation + */ + c_ptr = &bufC[i1][0]; + + mm_dp_64Cx64(c_ptr, &bufA[a_idx][0], &bufB[1][0]); + + a_step = spu_rlqwbyte(a_step, rotate); + a_lo += spu_extract(a_step, 0); + + /* if corner != 2 then fetch next A buffer + * else "corner turn" fetch next B buffer + */ + b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0)); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0)); + + idx = spu_extract(spu_sel(spu_promote(a_idx, 0), vone, corner_eq2), 0); + buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0), + spu_promote((unsigned int)bufB, 0), + corner_eq2), 0); + hi = spu_extract(spu_sel(spu_promote(a_hi, 0), + spu_promote(b_hi, 0), + corner_eq2), 0); + lo = spu_extract(spu_sel(spu_promote(a_lo, 0), + spu_promote(b_lo, 0), + corner_eq2), 0); + stride = spu_extract(spu_sel(spu_promote(lda, 0), + spu_promote(ldb, 0), + corner_eq2), 0); + + buf += idx * (unsigned int)(sizeof(bufA)/2); + dma_block_getl((vec_double2 *)buf, hi, lo, 1, stride); + + /* Transpose and swap the resulting block + * + * if (corner == 2) { + * p_lo += p_steph; + * p_stepv = -p_stepv; + * } else { + * p_lo += p_stepv; + * } + */ + transpose_and_swap(&bufC[i1][0]); + + dma_block_putl(&list[i1][0], c_ptr, p_hi, p_lo, 2, ldp); + + p_lo += spu_extract(spu_sel(p_stepv, p_steph, corner_eq2), 0); + p_stepv = spu_sel(p_stepv, spu_sub(0, p_stepv), corner_eq2); + + corner = spu_sel(spu_add(corner, -2), h_sub2_v, corner_eq2); + phase ^= spu_extract(corner_eq2, 0) & 1; + + i1 ^= 1; + a_idx = phase; + } + + /* Finish the last sub-block */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST((1<<1)|(1<<2)); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]); + + DMA_WAIT_RECEIVE(); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx^1][0], &bufB[1][0]); + + /* Transpose and swap the resulting block + */ + transpose_and_swap(&bufC[i1][0]); + dma_block_putl(&list[i1][0], &bufC[i1][0], p_hi, p_lo, 1, ldp); + } + + /* Report completion status if requested. + */ + report_completion(id, cmd_parms->incomplete, 1); +} Index: accel/lib/spu/accel_dtrsm.c =================================================================== RCS file: accel/lib/spu/accel_dtrsm.c diff -N accel/lib/spu/accel_dtrsm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dtrsm.c 20 Aug 2008 03:57:53 -0000 1.4 @@ -0,0 +1,154 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_dtrsm.h" + + +void accel_dtrsm(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dtrsm_parms_t *cmd_parms) +{ + int i; + unsigned int idx, tag, next_tag; + unsigned int size, lda, stride; + unsigned int id; + unsigned long long a, b; + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int list; + unsigned int n; + vec_uint4 ld; + vec_uint4 element, stride2, stride4, stride6; + volatile void *lsa; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + + /* DMA the entire 128x128 unit lower triangle into the LS. 
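+ * Only the lower triangle is ever referenced, so the loop below trims
+ * the transfer as it walks the columns, roughly (sizes illustrative):
+ *
+ *   columns   0..15  : ~128 doubles each
+ *   columns  16..31  : ~112 doubles each
+ *   ...
+ *   columns 112..126 : ~16 doubles each
+ *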
To reduce startup + * time, we will download only the necessary data columns in groups of 16 + * while preserving the cacheline alignment. The download will be done + * starting from the smallest column to the largest. + */ + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + lsa = (volatile void *)(&bufA_128x128[0]); + size = 128*sizeof(double); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + for (i=0; i<127; i++) { + unsigned int adjust; + + spu_mfcdma64(lsa, a_hi, a_lo, size, 0, MFC_GET_CMD); + + a_lo += lda; + lsa += 128*sizeof(double); + + /* Compute the next DMA parameters + */ + adjust = spu_extract(spu_and(spu_cmpeq(spu_promote((i & 15), 0), 14), 16*sizeof(double)), 0); + + a_lo += adjust; + lsa += adjust; + size -= adjust; + } + + n = spu_extract(cmd_parms->dim, 0) / 16; + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + b_lo += 16 * sizeof(double) * id; + + /* Download the initial set of 16 B columns + */ + stride = spu_extract(ld, 1); + + element = spu_add(spu_shuffle(spu_splats((unsigned int)(16*sizeof(double))), + spu_promote(b_lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + stride2 = spu_sl(spu_shuffle(ld, ld, ((vec_uchar16){128,128,128,128, 4,5,6,7, 128,128,128,128, 4,5,6,7})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + fill_dma_list(&bufB_list[0][0], element, stride2, stride4, stride6); + spu_mfcdma64(&bufB_128x16[0][0], b_hi, (unsigned int)(&bufB_list[0][0]), 128*8, 0, MFC_GETL_CMD); + + + idx = 1; + next_tag = 0; + tag = 1; + + DMA_WAIT_REQUEST(1<<0); + + for (i=id+HPL_ACCEL_SPES; i<(int)n; i+=HPL_ACCEL_SPES) { + /* Fetch the next buffer + */ + element = spu_add(element, ((vec_uint4){0, HPL_ACCEL_SPES*16*sizeof(double), 0, HPL_ACCEL_SPES*16*sizeof(double)})); + + fill_dma_list(&bufB_list[idx][0], element, stride2, stride4, stride6); + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, (unsigned int)(&bufB_list[idx][0]), 128*8, tag, MFC_GETLB_CMD); + tag ^= 1; + + /* Wait for the previous get to complete */ + DMA_WAIT_RECEIVE(); + + /* Perform the dtrsm. 
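+ * dtrsm_dp_128Cx16 solves [a]*[x] = [b] in place for the 16 B columns
+ * resident in this buffer, with [a] the unit lower triangular 128x128
+ * block loaded above (see the SYNOPSIS in accel_dtrsm_dp_128Cx16.S).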
+ */ + dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[tag][0]); + + idx = (idx + 1) & 3; + + list = (unsigned int)&bufB_list[idx^2][0]; + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, 128*8, tag, MFC_PUTL_CMD); + + next_tag = tag ^ 1; + + DMA_WAIT_REQUEST(1<incomplete, tag); +} + Index: accel/lib/spu/accel_dtrsm.h =================================================================== RCS file: accel/lib/spu/accel_dtrsm.h diff -N accel/lib/spu/accel_dtrsm.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dtrsm.h 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,83 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _ACCEL_DTRSM_H_ +#define _ACCEL_DTRSM_H_ 1 + + +extern void dtrsm_dp_128Cx16(vec_double2 *bufA, vec_double2 *bufB); + +static inline void fill_dma_list(volatile vec_uint4 *list, vec_uint4 e0, vec_uint4 stride2, vec_uint4 stride4, vec_uint4 stride6) +{ + vec_uint4 e1, e2; + + e1 = spu_add(e0, stride2); + e2 = spu_add(e0, stride4); list[0] = e0; + e0 = spu_add(e0, stride6); list[1] = e1; + e1 = spu_add(e1, stride6); list[2] = e2; + e2 = spu_add(e2, stride6); list[3] = e0; + e0 = spu_add(e0, stride6); list[4] = e1; + e1 = spu_add(e1, stride6); list[5] = e2; + e2 = spu_add(e2, stride6); list[6] = e0; + e0 = spu_add(e0, stride6); list[7] = e1; + e1 = spu_add(e1, stride6); list[8] = e2; + e2 = spu_add(e2, stride6); list[9] = e0; + e0 = spu_add(e0, stride6); list[10] = e1; + e1 = spu_add(e1, stride6); list[11] = e2; + e2 = spu_add(e2, stride6); list[12] = e0; + e0 = spu_add(e0, stride6); list[13] = e1; + e1 = spu_add(e1, stride6); list[14] = e2; + e2 = spu_add(e2, stride6); list[15] = e0; + e0 = spu_add(e0, stride6); list[16] = e1; + e1 = spu_add(e1, stride6); list[17] = e2; + e2 = spu_add(e2, stride6); list[18] = e0; + e0 = spu_add(e0, stride6); list[19] = e1; + e1 = spu_add(e1, stride6); list[20] = e2; + e2 = spu_add(e2, stride6); list[21] = e0; + e0 = spu_add(e0, stride6); list[22] = e1; + e1 = spu_add(e1, stride6); list[23] = e2; + e2 = spu_add(e2, stride6); list[24] = e0; + e0 = spu_add(e0, stride6); list[25] = e1; + e1 = spu_add(e1, stride6); list[26] = e2; + e2 = spu_add(e2, stride6); list[27] = e0; + e0 = spu_add(e0, stride6); list[28] = e1; + e1 = spu_add(e1, stride6); list[29] = e2; + e2 = spu_add(e2, stride6); list[30] = e0; + e0 = spu_add(e0, stride6); list[31] = e1; + e1 = spu_add(e1, stride6); list[32] = e2; + e2 = spu_add(e2, stride6); list[33] = e0; + e0 = spu_add(e0, stride6); list[34] = e1; + e1 = spu_add(e1, stride6); list[35] = e2; + e2 = spu_add(e2, stride6); list[36] = e0; + e0 = spu_add(e0, stride6); list[37] = e1; + e1 = spu_add(e1, stride6); list[38] = e2; + e2 = spu_add(e2, stride6); list[39] = e0; + e0 = spu_add(e0, stride6); list[40] = e1; + e1 = spu_add(e1, stride6); list[41] = e2; + e2 = spu_add(e2, stride6); list[42] = e0; + e0 = spu_add(e0, stride6); list[43] = e1; + e1 = spu_add(e1, stride6); list[44] = e2; + e2 = spu_add(e2, stride6); list[45] = e0; + e0 = spu_add(e0, stride6); list[46] = e1; + e1 = spu_add(e1, stride6); list[47] = e2; + e2 = spu_add(e2, stride6); list[48] = e0; + e0 = spu_add(e0, stride6); list[49] = e1; + e1 = spu_add(e1, stride6); list[50] = e2; + e2 = spu_add(e2, stride6); list[51] = e0; + e0 = spu_add(e0, stride6); list[52] = e1; + e1 = spu_add(e1, stride6); list[53] = e2; + e2 = spu_add(e2, stride6); list[54] = e0; + e0 = spu_add(e0, stride6); list[55] = e1; + e1 = spu_add(e1, 
stride6); list[56] = e2; + e2 = spu_add(e2, stride6); list[57] = e0; + e0 = spu_add(e0, stride6); list[58] = e1; + e1 = spu_add(e1, stride6); list[59] = e2; + e2 = spu_add(e2, stride6); list[60] = e0; + e0 = spu_add(e0, stride6); list[61] = e1; + list[62] = e2; + list[63] = e0; +} + +#endif /* _ACCEL_DTRSM_H_ */ Index: accel/lib/spu/accel_dtrsm_CL_B.c =================================================================== RCS file: accel/lib/spu/accel_dtrsm_CL_B.c diff -N accel/lib/spu/accel_dtrsm_CL_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dtrsm_CL_B.c 22 Oct 2008 03:28:08 -0000 1.4 @@ -0,0 +1,249 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_dtrsm.h" + + +void accel_dtrsm_CL_B(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dtrsm_parms_t *cmd_parms) +{ + int i; + unsigned int idx, tag, next_tag; + unsigned int size, lda, stride; + unsigned int id; + unsigned long long a, b; + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int list; + unsigned int n; + unsigned int span; + vec_uint4 ld; + vec_uint4 element, stride2, stride4, stride6, next; + volatile void *lsa; +#ifdef MATRIX_4GB_CROSSING + unsigned int list_size, hi; + vec_uint4 sizes[4]; + vec_uint4 b_his[4]; +#endif +#if (HPL_ACCEL_SPES & 3) != 0 + unsigned int stride0, stride1; + vec_uint4 blk_idx, next0, next1; +#endif + + id = parms->id; + + stride2 = ((vec_uint4){0, 2*64*sizeof(double), 0, 2*64*sizeof(double)}); + stride4 = ((vec_uint4){0, 4*64*sizeof(double), 0, 4*64*sizeof(double)}); + stride6 = ((vec_uint4){0, 6*64*sizeof(double), 0, 6*64*sizeof(double)}); + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + + /* DMA the entire 128x128 unit lower triangle into the LS. To reduce startup + * time, we will download only the necessary data columns in groups of 16 + * while preserving the cacheline alignment. The download will be done + * starting from the smallest column to the largest. + */ + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + lsa = (volatile void *)(&bufA_128x128[0]); + size = 128*sizeof(double); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. 
+ */ + DMA_WAIT_RECEIVE(); + + for (i=0; i<127; i++) { + unsigned int adjust; + + spu_mfcdma64(lsa, a_hi, a_lo, size, 0, MFC_GET_CMD); + + a_lo += lda; + lsa += 128*sizeof(double); + + /* Compute the next DMA parameters + */ + adjust = spu_extract(spu_and(spu_cmpeq(spu_promote((i & 15), 0), 14), 16*sizeof(double)), 0); + + a_lo += adjust; + lsa += adjust; + size -= adjust; + } + + n = spu_extract(cmd_parms->dim, 0) / 16; + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + /* Download the initial set of 16 B columns + */ + span = spu_extract(cmd_parms->blk_col, 0) + id; + stride = spu_extract(ld, 1); + b_lo += (span & 3) * 16 * sizeof(double); + MATRIX_EA_UMADD32(b_hi, b_lo, (span/4), stride); + element = spu_add(spu_shuffle(spu_splats((unsigned int)(16*sizeof(double))), spu_promote(b_lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + ((vec_uint4){0, 0, 0, 64*sizeof(double)})); + + +#if (HPL_ACCEL_SPES & 3) != 0 + blk_idx = spu_splats(span & 3); + + stride0 = stride * (HPL_ACCEL_SPES / 4); + stride1 = stride * (1 + HPL_ACCEL_SPES / 4); + stride0 += ( HPL_ACCEL_SPES & 3)*16*(int)sizeof(double); + stride1 -= (-HPL_ACCEL_SPES & 3)*16*(int)sizeof(double); + + next0 = spu_shuffle(spu_promote(stride0, 0), spu_promote(stride0, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); + next1 = spu_shuffle(spu_promote(stride1, 0), spu_promote(stride1, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); +#else + stride *= HPL_ACCEL_SPES / 4; + next = spu_shuffle(spu_promote(stride, 0), + spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); +#endif + + + list = (unsigned int)&bufB_list[0][0]; + fill_dma_list((volatile vec_uint4 *)list, element, stride2, stride4, stride6); + +#if (HPL_ACCEL_SPES & 3) != 0 + blk_idx = spu_add(blk_idx, (HPL_ACCEL_SPES & 3)); + next = spu_sel(next0, next1, spu_cmpgt(blk_idx, 3)); + blk_idx = spu_and(blk_idx, 3); +#endif + +#ifdef MATRIX_4GB_CROSSING + /* The list 4GB crossing can only occur at block boundary. Therefore, halfway through + * the list. + */ + list_size = (spu_extract(element, 1) > (0xFFFFFFFF - M_SUB*M_SUB*8)) ? (M_SUB*8) : (M*8); + + spu_mfcdma64(&bufB_128x16[0][0], b_hi, list, list_size, 0, MFC_GETL_CMD); + spu_mfcdma64(&bufB_128x16[0][list_size], b_hi+1, list+(M_SUB*8), M*8-list_size, 0, MFC_GETL_CMD); + + sizes[0] = spu_promote(list_size, 0); + b_his[0] = spu_promote(b_hi, 0); + + b_hi += spu_extract(spu_genc(element, next), 1); +#else + spu_mfcdma64(&bufB_128x16[0][0], b_hi, list, 128*8, 0, MFC_GETL_CMD); +#endif + element = spu_add(element, next); + + idx = 1; + next_tag = 0; + tag = 1; + + DMA_WAIT_REQUEST(1<<0); + + for (i=id+HPL_ACCEL_SPES; i<(int)n; i+=HPL_ACCEL_SPES) { + /* Fetch the next buffer + */ + list = (unsigned int)&bufB_list[idx][0]; + fill_dma_list((volatile vec_uint4 *)list, element, stride2, stride4, stride6); + +#if (HPL_ACCEL_SPES & 3) != 0 + blk_idx = spu_add(blk_idx, (HPL_ACCEL_SPES & 3)); + next = spu_sel(next0, next1, spu_cmpgt(blk_idx, 3)); + blk_idx = spu_and(blk_idx, 3); +#endif + +#ifdef MATRIX_4GB_CROSSING + /* The list 4GB crossing can only occur at block boundary. Therefore, halfway through + * the list. + */ + list_size = (spu_extract(element, 1) > (0xFFFFFFFF - M_SUB*M_SUB*8)) ? 
(M_SUB*8) : (M*8); + + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, list_size, tag, MFC_GETLB_CMD); + spu_mfcdma64(&bufB_128x16[tag][list_size], b_hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_GETL_CMD); + + sizes[idx] = spu_promote(list_size, 0); + b_his[idx] = spu_promote(b_hi, 0); + + b_hi += spu_extract(spu_genc(element, next), 1); +#else + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, 128*8, tag, MFC_GETLB_CMD); +#endif + element = spu_add(element, next); + + tag ^= 1; + + /* Wait for the previous get to complete */ + DMA_WAIT_RECEIVE(); + + /* Perform the dtrsm. + */ + dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[tag][0]); + + idx = (idx + 1) & 3; + + /* Store the update matrix columns back to memory + */ + list = (unsigned int)&bufB_list[idx^2][0]; +#ifdef MATRIX_4GB_CROSSING + list_size = spu_extract(sizes[idx^2], 0); + + hi = spu_extract(b_his[idx^2], 0); + + spu_mfcdma64(&bufB_128x16[tag][0], hi, list, list_size, tag, MFC_PUTL_CMD); + spu_mfcdma64(&bufB_128x16[tag][list_size], hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_PUTL_CMD); +#else + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, 128*8, tag, MFC_PUTL_CMD); +#endif + + next_tag = tag ^ 1; + + DMA_WAIT_REQUEST(1<incomplete, tag); +} + Index: accel/lib/spu/accel_dtrsm_dp_128Cx16.S =================================================================== RCS file: accel/lib/spu/accel_dtrsm_dp_128Cx16.S diff -N accel/lib/spu/accel_dtrsm_dp_128Cx16.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dtrsm_dp_128Cx16.S 23 Oct 2008 21:20:24 -0000 1.3 @@ -0,0 +1,2270 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +/* + * SYNOPSIS: + * void dtrsm_dp_128Cx16(vec_double2 *bufA, vec_double2 *bufB) + * + * DESCRIPTION: + * This file contains a specialized DTRSM function that solves + * the matrix equation for [x]. + * + * [a]*[x] = [b] + * + * where: + * [a] is a unit lower, column ordered, double precision, little endian triangle + * matrix of 128 rows by 128 columns. + * [b] is a row ordered, double precision, matrix of 128 rows and 16 columns. + * The solution [x] is returned in [b]. 
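+ *
+ *    In plain scalar form (an illustrative sketch of the math only, not
+ *    the unrolled kernel), the solve is forward substitution; the unit
+ *    diagonal means no divide is required:
+ *
+ *      for (i=0; i<128; i++) {           # rows of [a] and [b]
+ *        for (j=0; j<16; j++) {          # columns of [b]
+ *          for (x=0; x<i; x++) {         # previously solved rows
+ *            b[i][j] -= a[i][x] * b[x][j];
+ *          }
+ *        }
+ *      }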
+ * + * This implementation is a highly optimized solution that mimics the following + * scalar design that processes 4 rows of b at a time: + * + * for (i=0; i<128; i+=4) { # iloop + * for (x=0; x +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_dtrsm.h" + +void accel_dtrsm_panel(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dtrsm_parms_t *cmd_parms) +{ + int i; + unsigned int idx, tag, next_tag; + unsigned int size, lda, stride; + unsigned int id; + unsigned long long a, b, c; + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int c_hi, c_lo; + unsigned int list; + unsigned int n; + unsigned int span; + vec_uint4 ld; + vec_uint4 element, stride2, stride4, stride6; + vec_uint4 elementc, nextc, stride2c, stride4c, stride6c; + volatile void *lsa; +#ifdef MATRIX_4GB_CROSSING + unsigned int list_size; +#endif +#if (HPL_ACCEL_SPES & 3) != 0 + unsigned int stride0c, stride1c; + vec_uint4 blk_idx, next0c, next1c; +#endif + + id = parms->id; + + stride2c = ((vec_uint4){0, 2*64*sizeof(double), 0, 2*64*sizeof(double)}); + stride4c = ((vec_uint4){0, 4*64*sizeof(double), 0, 4*64*sizeof(double)}); + stride6c = ((vec_uint4){0, 6*64*sizeof(double), 0, 6*64*sizeof(double)}); + + elementc = (vec_uint4){0}; /* included just to eliminate a warning */ + nextc = (vec_uint4){0}; /* included just to eliminate a warning */ + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + c = cmd_parms->c; + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + + /* DMA the entire 128x128 unit lower triangle into the LS. To reduce startup + * time, we will download only the necessary data columns in groups of 16 + * while preserving the cacheline alignment. The download will be done + * starting from the smallest column to the largest. + */ + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + lsa = (volatile void *)(&bufA_128x128[0]); + size = 128*sizeof(double); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. 
+ */ + DMA_WAIT_RECEIVE(); + + for (i=0; i<127; i++) { + unsigned int adjust; + + spu_mfcdma64(lsa, a_hi, a_lo, size, 0, MFC_GET_CMD); + + a_lo += lda; + lsa += 128*sizeof(double); + + /* Compute the next DMA parameters + */ + adjust = spu_extract(spu_and(spu_cmpeq(spu_promote((i & 15), 0), 14), 16*sizeof(double)), 0); + + a_lo += adjust; + lsa += adjust; + size -= adjust; + } + + n = spu_extract(cmd_parms->dim, 0) / 16; + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + b_lo += 16 * sizeof(double) * id; + + /* Download the initial set of 16 B columns + */ + stride = spu_extract(ld, 1); + + element = spu_add(spu_shuffle(spu_splats((unsigned int)(16*sizeof(double))), + spu_promote(b_lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + stride2 = spu_sl(spu_shuffle(ld, ld, ((vec_uchar16){128,128,128,128, 4,5,6,7, 128,128,128,128, 4,5,6,7})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + fill_dma_list(&bufB_list[0][0], element, stride2, stride4, stride6); + spu_mfcdma64(&bufB_128x16[0][0], b_hi, (unsigned int)(&bufB_list[0][0]), 128*8, 0, MFC_GETL_CMD); + + + c_hi = mfc_ea2h(c); + c_lo = mfc_ea2l(c); + + span = spu_extract(cmd_parms->blk_col, 0) + id; + stride = spu_extract(ld, 2); + c_lo += (span & 3) * 16 * sizeof(double); + MATRIX_EA_UMADD32(c_hi, c_lo, (span/4), stride); + elementc = spu_add(spu_shuffle(element, spu_promote(c_lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + ((vec_uint4){0, 0, 0, 64*sizeof(double)})); + +#if (HPL_ACCEL_SPES & 3) != 0 + blk_idx = spu_splats(span & 3); + + stride0c = stride * (HPL_ACCEL_SPES / 4); + stride1c = stride * (1 + HPL_ACCEL_SPES / 4); + stride0c += ( HPL_ACCEL_SPES & 3)*16*(int)sizeof(double); + stride1c -= (-HPL_ACCEL_SPES & 3)*16*(int)sizeof(double); + + next0c = spu_shuffle(spu_promote(stride0c, 0), spu_promote(stride0c, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); + next1c = spu_shuffle(spu_promote(stride1c, 0), spu_promote(stride1c, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); +#else + stride *= HPL_ACCEL_SPES / 4; + nextc = spu_shuffle(spu_promote(stride, 0), + spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); +#endif + + idx = 1; + next_tag = 0; + tag = 1; + + DMA_WAIT_REQUEST(1<<0); + + for (i=id+HPL_ACCEL_SPES; i<(int)n; i+=HPL_ACCEL_SPES) { + /* Fetch the next buffer + */ + element = spu_add(element, ((vec_uint4){0, HPL_ACCEL_SPES*16*sizeof(double), 0, HPL_ACCEL_SPES*16*sizeof(double)})); + + fill_dma_list(&bufB_list[idx][0], element, stride2, stride4, stride6); + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, (unsigned int)(&bufB_list[idx][0]), 128*8, tag, MFC_GETLB_CMD); + tag ^= 1; + + /* Wait for the previous get to complete */ + DMA_WAIT_RECEIVE(); + + /* Perform the dtrsm. + */ + dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[tag][0]); + + idx = (idx + 1) & 3; + + /* Store the results back to system memory in c + * Construct the display list to store to the blocked formated C matrix. + */ + list = (unsigned int)(&bufB_list[idx+4][0]); + fill_dma_list((volatile vec_uint4 *)list, elementc, stride2c, stride4c, stride6c); + +#if (HPL_ACCEL_SPES & 3) != 0 + blk_idx = spu_add(blk_idx, (HPL_ACCEL_SPES & 3)); + nextc = spu_sel(next0c, next1c, spu_cmpgt(blk_idx, 3)); + blk_idx = spu_and(blk_idx, 3); +#endif + +#ifdef MATRIX_4GB_CROSSING + /* The list 4GB crossing can only occur at block boundary. 
Therefore, halfway through
+ * the list.
+ */
+  list_size = (spu_extract(elementc, 1) > (0xFFFFFFFF - M_SUB*M_SUB*8)) ? (M_SUB*8) : (M*8);
+
+  spu_mfcdma64(&bufB_128x16[tag][0], c_hi, list, list_size, tag, MFC_PUTL_CMD);
+  spu_mfcdma64(&bufB_128x16[tag][list_size], c_hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_PUTL_CMD);
+
+  c_hi += spu_extract(spu_genc(elementc, nextc), 1);
+#else
+  spu_mfcdma64(&bufB_128x16[tag][0], c_hi, list, 128*8, tag, MFC_PUTL_CMD);
+#endif
+  elementc = spu_add(elementc, nextc);
+
+  next_tag = tag ^ 1;
+
+  DMA_WAIT_REQUEST(1< (0xFFFFFFFF - M_SUB*M_SUB*8)) ? (M_SUB*8) : (M*8);
+
+  spu_mfcdma64(&bufB_128x16[next_tag][0], c_hi, list, list_size, tag, MFC_PUTL_CMD);
+  spu_mfcdma64(&bufB_128x16[next_tag][list_size], c_hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_PUTL_CMD);
+
+  c_hi += spu_extract(spu_genc(elementc, nextc), 1);
+#else
+  spu_mfcdma64(&bufB_128x16[next_tag][0], c_hi, list, 128*8, tag, MFC_PUTL_CMD);
+#endif
+
+  elementc = spu_add(elementc, nextc);
+
+  /* Report completion status if requested.
+   */
+  report_completion(id, cmd_parms->incomplete, tag);
+}
+
Index: accel/lib/spu/accel_mm_dp.c
===================================================================
RCS file: accel/lib/spu/accel_mm_dp.c
diff -N accel/lib/spu/accel_mm_dp.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_mm_dp.c	20 Aug 2008 03:57:53 -0000	1.8
@@ -0,0 +1,289 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2008                               */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include <spu_intrinsics.h>
+
+
+/* Compute generalized matrix multiply of the form
+ *
+ *    [C] -= [A] * [B]
+ *
+ * where
+ *   C is a row ordered matrix of dimension n by m (width by height) elements
+ *     with leading dimension n.
+ *   A is a row ordered matrix of dimension k by m elements with leading
+ *     dimension k.
+ *   B is a row ordered matrix of dimension n by k elements with leading
+ *     dimension n.
+ *
+ * The computation is performed by computing the result using sub-blocks of
+ * size 8x4 for B and C, and 4x4 for A.
+ *
+ * This blocking mandates that k and m must be an integral multiple of 4 and
+ * n must be an integral multiple of 8.
+ *
+ * NOTE: The leading dimensions are a double stride, not a vector stride.
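+ *
+ * Scalar reference for the semantics (an editor's sketch; indices in
+ * doubles, with the leading dimensions n and k used as row strides):
+ *
+ *   for (y=0; y<m; y++)
+ *     for (x=0; x<n; x++)
+ *       for (i=0; i<k; i++)
+ *         c[y*n + x] -= a[y*k + i] * b[i*n + x];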
+ */ + +void mm_dp(int k, int m, int n, vector double *c, vector double *a, vector double *b) +{ + int i, x, y; + vector unsigned int pA, pB, pC; + vector unsigned int pA_start, pA_row, pB_start, pC_start; + vector unsigned int n1, n32, k1, k32; + vector double *pA0, *pA1, *pA2, *pA3; + vector double *pB0, *pB1, *pB2, *pB3; + vector double *pC0, *pC1, *pC2, *pC3; + vector double A00, A01, A10, A11, A20, A21, A30, A31; + vector double A00_0, A10_0, A20_0, A30_0; + vector double A00_1, A10_1, A20_1, A30_1; + vector double A01_0, A11_0, A21_0, A31_0; + vector double A01_1, A11_1, A21_1, A31_1; + vector double B00, B01, B02, B03; + vector double B10, B11, B12, B13; + vector double B20, B21, B22, B23; + vector double B30, B31, B32, B33; + vector double C00, C01, C02, C03; + vector double C10, C11, C12, C13; + vector double C20, C21, C22, C23; + vector double C30, C31, C32, C33; + vector unsigned int v_0123 = (vector unsigned int){0, 8, 16, 24}; + vector unsigned int n_0123, k_0123; + vector unsigned char pat0 = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; + vector unsigned char pat1; + + pat1 = spu_or(pat0, 8); + + /* Precompute 4 local store pointers for each of the buffer pointers + * + * pA_start = a+0*k, a+1*k, a+2*k, a+3*k + * pB_start = b+0*n, b+1*n, a+2*n, a+3*n + * pC_start = c+0*n, c+1*n, a+2*n, a+3*n + * + * where a, b, c are double pointers. + */ + k1 = spu_splats((unsigned int)k); + n1 = spu_splats((unsigned int)n); + + k_0123 = spu_mulo((vector unsigned short)k1, (vector unsigned short)v_0123); + n_0123 = spu_mulo((vector unsigned short)n1, (vector unsigned short)v_0123); + pA_start = spu_add(spu_splats((unsigned int)a), k_0123); + pB_start = spu_add(spu_splats((unsigned int)b), n_0123); + pC_start = spu_add(spu_splats((unsigned int)c), n_0123); + + n32 = spu_sl(n1, 5); + k32 = spu_sl(k1, 5); + + for (x=0; x +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + +void accel_reform_matrix_CL_to_B(hpl_accel_init_parms_t *parms, + volatile hpl_accel_reform_matrix_CL_to_B_parms_t *cmd_parms) +{ + int i; + unsigned int x, y; + unsigned int id; + unsigned long long a, scratch; + unsigned int a_hi, a_lo, out_hi, out_lo; + unsigned int scratch_hi, scratch_lo, lo; + unsigned int n, nb, m, mb, m_pad, lda, spes, trailing, left; + unsigned int dst_idx; + unsigned int tag, next_tag; + unsigned int retained; /* Number of buffers kept in local store instead of the scratch buffer */ + vec_uint4 next_col_blk, next_row_blk; + vec_uint4 element0, element1, element2, element3, element_next; + volatile vec_uint4 *list; + vec_uint4 mask_0101 = (vec_uint4){0,-1,0,-1}; +#ifdef ACCEL_LITTLE_ENDIAN + vec_uchar16 pat_even = (vec_uchar16){7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; +#else + vec_uchar16 pat_even = (vec_uchar16){0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23}; +#endif + vec_uchar16 pat_odd = spu_or(pat_even, 8); + vec_uchar16 pat_z0z0 = (vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3}; + vec_uchar16 pat_zzzz = (vec_uchar16){128,128,128,128, 128,128,128,128, 128,128,128,128, 128,128,128,128}; + vec_double2 *srcTop, *srcBot, *dst, *buf; + vec_double2 a0, a1, a2, a3, a4, a5, a6, a7; +#ifdef MATRIX_4GB_CROSSING + unsigned int in_hi; + vec_uint4 carry; +#endif + + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + + /* Fetch the parameters + */ + a = cmd_parms->a; + scratch = cmd_parms->scratch; + lda = cmd_parms->lda; + m = cmd_parms->m; + n = 
cmd_parms->n; + spes = cmd_parms->spes; + + /* Pad m and n to the nearest block and compute the number of blocks to be + * reformated. Rows are padded with 0.0. Columns to filled in with don't care + * values. + */ + m_pad = (m % M_SUB) - 1; + mb = (m + M_SUB-1)/M_SUB; + + nb = (n + M_SUB-1)/M_SUB; + + /* Compute the amount of trailing data to zero after the blocked data. + */ + trailing = (lda - mb*M_SUB*sizeof(double))*M_SUB; + + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + MATRIX_EA_UMADD32(a_hi, a_lo, lda, id*M_SUB); + +#ifdef MATRIX_4GB_CROSSING + in_hi = a_hi; +#endif + + scratch_hi = mfc_ea2h(scratch); + scratch_lo = mfc_ea2l(scratch); + + scratch_lo += id*(mb-4)*M_SUB*M_SUB*sizeof(double); + + /* Compute all the working variables needed to generate that DMA lists. + * + * element0 = {M_SUB*sizeof(double), a_lo + 0*lda, M_SUB*sizeof(double), a_lo + 1*lda} + * element1 = {M_SUB*sizeof(double), a_lo + 2*lda, M_SUB*sizeof(double), a_lo + 3*lda} + * element2 = {M_SUB*sizeof(double), a_lo + 4*lda, M_SUB*sizeof(double), a_lo + 5*lda} + * element3 = {M_SUB*sizeof(double), a_lo + 6*lda, M_SUB*sizeof(double), a_lo + 7*lda} + * element_next = { 0, 8*lda, 0, 8*lda} + */ + next_col_blk = spu_splats((unsigned int)(M_SUB*sizeof(double))); + next_row_blk = spu_and(spu_splats(lda*M_SUB*spes - mb*M_SUB*sizeof(double)), mask_0101); + + element_next = spu_promote(8*lda, 0); + element_next = spu_shuffle(element_next, element_next, pat_z0z0); + + element0 = spu_add(spu_rlmaskqwbyte(spu_rlmask(element_next, -3), -8), + spu_sel(spu_splats((unsigned int)(M_SUB*sizeof(double))), spu_splats(a_lo), mask_0101)); + + next_col_blk = spu_and(next_col_blk, mask_0101); + + element1 = spu_rlmask(element_next, -2); + element2 = spu_rlmask(element_next, -1); + element3 = spu_add(spu_add(element1, element2), element0); + element1 = spu_add(element1, element0); + element2 = spu_add(element2, element0); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT(-1); + + /* Reformat the blocks + */ + tag = 0; + + for (x=id; x 1) { + if (y != mb-2) { + /* If this is the next to last block of the column, then do not put into + * the scratch buffer. + */ + dst = &bufB[dst_idx][0]; + spu_mfcdma64(dst, scratch_hi, lo, 16384, tag, MFC_PUT_CMD); + spu_mfcdma64(dst + 1024, scratch_hi, lo+16384, 16384, tag, MFC_PUT_CMD); + lo += M_SUB*M_SUB*sizeof(double); + } + dst_idx ^= 1; + } else { + dst_idx++; + retained++; + } + + tag = next_tag; + } + + /* Wait for the final block get before putting reformated blocks back into the + * matrix. + */ + DMA_WAIT(1< 3) { + srcTop = &bufB[dst_idx^1][0]; + spu_mfcdma64(srcTop, out_hi, out_lo, 16384, tag^1, MFC_PUT_CMD); + spu_mfcdma64(srcTop + 1024, out_hi, out_lo+16384, 16384, tag^1, MFC_PUT_CMD); + MATRIX_EA_UADD32(out_hi, out_lo, 32768); + } + + + /* Finish reformating the last block. The last block contains special handling + * code to zeros out the pad rows. 
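+ * Rows up through m_pad keep their data; any later row has its shuffle
+ * patterns replaced by pat_zzzz so zeros are stored instead. A sketch of
+ * the selects performed below:
+ *
+ *   pat_e = (i   <= m_pad) ? pat_even : pat_zzzz;
+ *   pat_o = (i+1 <= m_pad) ? pat_odd  : pat_zzzz;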
+ */ + srcTop = &bufA[tag][16*M_SUB/2]; + srcBot = &bufA[tag][48*M_SUB/2]; + dst = &bufB[dst_idx][0]; + + for (i=0; i<64; i+=2) { + vec_uchar16 pat_e, pat_o; + + pat_e = spu_sel(pat_even, pat_zzzz, + spu_maskb(spu_extract(spu_cmpgt(spu_promote((unsigned int)i, 0), spu_promote(m_pad, 0)), 0))); + pat_o = spu_sel(pat_odd, pat_zzzz, + spu_maskb(spu_extract(spu_cmpgt(spu_promote((unsigned int)(i+1), 0), spu_promote(m_pad, 0)), 0))); + REFORM_8(dst, srcTop, -16, 0, pat_e, pat_o); + REFORM_8(dst, srcTop, -8, 4, pat_e, pat_o); + REFORM_8(dst, srcTop, 0, 8, pat_e, pat_o); + REFORM_8(dst, srcTop, 8, 12, pat_e, pat_o); + + REFORM_8(dst, srcBot, -16, 16, pat_e, pat_o); + REFORM_8(dst, srcBot, -8, 20, pat_e, pat_o); + REFORM_8(dst, srcBot, 0, 24, pat_e, pat_o); + REFORM_8(dst, srcBot, 8, 28, pat_e, pat_o); + + srcTop += 1; + srcBot += 1; + dst += 2*M_SUB/2; + } + + /* Store the final block back into the matrix. + */ + spu_mfcdma64(&bufB[dst_idx][0], out_hi, out_lo, 16384, tag, MFC_PUT_CMD); + spu_mfcdma64(&bufB[dst_idx][1024], out_hi, out_lo+16384, 16384, tag, MFC_PUT_CMD); + + /* Zero out final trailing data resulting from lda striding. + */ + MATRIX_EA_UADD32(out_hi, out_lo, 32768); + + left = trailing; + while (left) { + unsigned int size; + +#ifndef MFC_SDCRZ_CMD +#define MFC_SDCRZ_CMD 0x0089 /* SPU Only */ +#endif /* MFC_SDCRZ_CMD */ + + size = (left > 16384) ? 16384 : left; + spu_mfcdma64(0, out_hi, out_lo, size, tag, MFC_SDCRZ_CMD); + + MATRIX_EA_UADD32(out_hi, out_lo, size); + left -= size; + } + + /* Advance pointers to next column to be processed. + */ +#ifdef MATRIX_4GB_CROSSING + in_hi += spu_extract(spu_genc(element0, next_row_blk), 1); +#endif + element0 = spu_add(element0, next_row_blk); + element1 = spu_add(element1, next_row_blk); + element2 = spu_add(element2, next_row_blk); + element3 = spu_add(element3, next_row_blk); + + MATRIX_EA_UMADD32(a_hi, a_lo, lda, spes*M_SUB); + + /* Wait for all the transfers except the final block to complete + */ + DMA_WAIT(1<<(tag^1)); + } + + /* Report completion status if requested. 
+ */ + report_completion(id, cmd_parms->incomplete, tag); +} + Index: accel/lib/spu/accel_reform_panel_B_to_CL.c =================================================================== RCS file: accel/lib/spu/accel_reform_panel_B_to_CL.c diff -N accel/lib/spu/accel_reform_panel_B_to_CL.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_reform_panel_B_to_CL.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,247 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + + +void accel_reform_panel_B_to_CL(hpl_accel_init_parms_t *parms, + volatile hpl_accel_reform_panel_parms_t *cmd_parms) +{ + int i; + unsigned int id; + int x, y, columns, rows; + int dma_size1, dma_size2; + unsigned long long a, panel; + unsigned int a_hi, a_lo, hi, lo; + unsigned int panel_hi, panel_lo; + unsigned int lda, ldp; + unsigned int n, m, mb; + unsigned int tag, next_tag; + unsigned int addend; +#ifdef ACCEL_LITTLE_ENDIAN + vec_uchar16 pat_even = (vec_uchar16){7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; +#else + vec_uchar16 pat_even = (vec_uchar16){0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23}; +#endif + vec_uchar16 pat_odd = spu_or(pat_even, 8); + vec_uchar16 pat_z0z0 = (vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3}; + vec_uint4 e0, e1, e2, e3, esize; + vec_uint4 element0, element1, element2, element3, element_next; + vec_uint4 next_col_blk, next_row_blk; + vec_uint4 mask_0101 = (vec_uint4){0,-1,0,-1}; + vec_double2 a0, a1, a2, a3, a4, a5, a6, a7; + vec_double2 *srcTop, *srcBot, *dst; + volatile vec_uint4 *list; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + a = cmd_parms->a; + panel = cmd_parms->panel; + lda = cmd_parms->lda; + ldp = cmd_parms->ldp; + m = cmd_parms->m; + n = cmd_parms->n; + + mb = (m + (M_SUB-1)) / M_SUB; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + addend = id * (M_SUB * M_SUB * sizeof(double)); + MATRIX_EA_UADD32(a_hi, a_lo, addend); + + panel_hi = mfc_ea2h(panel); + panel_lo = mfc_ea2l(panel); + + panel_lo += id * M_SUB * sizeof(double); + + /* Compute all the working variables needed to generate the DMA lists. 
+ * + * element0 = {M_SUB*sizeof(double), panel_lo + 0*ldp, M_SUB*sizeof(double), panel_lo + 1*ldp} + * element1 = {M_SUB*sizeof(double), panel_lo + 2*ldp, M_SUB*sizeof(double), panel_lo + 3*ldp} + * element2 = {M_SUB*sizeof(double), panel_lo + 4*ldp, M_SUB*sizeof(double), panel_lo + 5*ldp} + * element3 = {M_SUB*sizeof(double), panel_lo + 6*ldp, M_SUB*sizeof(double), panel_lo + 7*ldp} + * element_next = { 0, 8*ldp, 0, 8*ldp} + */ + next_col_blk = spu_and(spu_splats(ldp*M_SUB), mask_0101); + next_row_blk = spu_and(spu_splats(HPL_ACCEL_REFORM_SPES*M_SUB*sizeof(double)), mask_0101); + + element_next = spu_promote(8*ldp, 0); + element_next = spu_shuffle(element_next, element_next, pat_z0z0); + + element0 = spu_add(spu_rlmaskqwbyte(spu_rlmask(element_next, -3), -8), + spu_sel(spu_splats((unsigned int)(M_SUB*sizeof(double))), spu_splats(panel_lo), mask_0101)); + + element1 = spu_rlmask(element_next, -2); + element2 = spu_rlmask(element_next, -1); + element3 = spu_add(spu_add(element1, element2), element0); + element1 = spu_add(element1, element0); + element2 = spu_add(element2, element0); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + /* Reformat the blocks + */ + tag = 0; + + /* Fetch the first block + */ + if (id < mb) { + dma_size1 = (int)m - id*M_SUB; + dma_size2 = dma_size1-32; + dma_size1 = clamp_0_32(dma_size1); + dma_size2 = clamp_0_32(dma_size2); + rows = (dma_size1 + dma_size2) / 2; + esize = spu_promote(rows * sizeof(vec_double2), 0); + + spu_mfcdma64(&bufA[0][0], a_hi, a_lo, (unsigned int)dma_size1*M_SUB*sizeof(double), 0, MFC_GET_CMD); + spu_mfcdma64(&bufA[0][1024], a_hi, a_lo+16384, (unsigned int)dma_size2*M_SUB*sizeof(double), 0, MFC_GET_CMD); + } + + /* For each of the row of blocks. + */ + for (y=id; y<(int)mb; ) { + hi = a_hi; + lo = a_lo; + MATRIX_EA_UADD32(hi, lo, lda); + + e0 = element0; + e1 = element1; + e2 = element2; + e3 = element3; + + for (x=0; x<(int)n-M_SUB; x+=M_SUB) { + next_tag = tag ^ 1; + + /* Fetch the next block. 
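+ * (Double buffered: this get is issued under next_tag while the block
+ * previously fetched under tag is still being reformatted, so the two
+ * bufA buffers alternate and the transfer time hides behind the compute.)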
+ */ + spu_mfcdma64(&bufA[next_tag][0], hi, lo, 16384, next_tag, MFC_GET_CMD); + spu_mfcdma64(&bufA[next_tag][1024], hi, lo+16384, 16384, next_tag, MFC_GET_CMD); + MATRIX_EA_UADD32(hi, lo, lda); + + DMA_WAIT(1<incomplete, tag^1); +} Index: accel/lib/spu/accel_reform_panel_R_to_B.c =================================================================== RCS file: accel/lib/spu/accel_reform_panel_R_to_B.c diff -N accel/lib/spu/accel_reform_panel_R_to_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_reform_panel_R_to_B.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,153 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + + +void accel_reform_panel_R_to_B(hpl_accel_init_parms_t *parms, + volatile hpl_accel_reform_panel_parms_t *cmd_parms) +{ + int i, x, y; + unsigned int id; + unsigned int idx; + unsigned int a_hi, a_lo, hi, lo; + unsigned long long a, panel; + unsigned int panel_hi, panel_lo, p_lo; + unsigned int tag; + unsigned int lda, ldp; + unsigned int n, m, row_len, size, left, esize, extra; + unsigned int *list, list_offset; + unsigned int addend; + vec_double2 *buf; +#ifdef MATRIX_4GB_CROSSING + unsigned int carry; +#endif + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + + /* Fetch the parameters + */ + a = cmd_parms->a; + panel = cmd_parms->panel; + lda = cmd_parms->lda; + ldp = cmd_parms->ldp; + m = cmd_parms->m; + n = cmd_parms->n; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + panel_hi = mfc_ea2h(panel); + panel_lo = mfc_ea2l(panel); + + addend = id * (M_SUB * sizeof(double)); + + MATRIX_EA_UADD32(a_hi, a_lo, addend); + panel_lo += id * ldp; + + row_len = (n&~1)*sizeof(double); + extra = (n&1)*sizeof(double); + + tag = 0; + list_offset = 0; + size = 0; + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT(-1); + + /* For each row + */ + for (y=(int)id; y<(int)m; y+=HPL_ACCEL_SPES) { + /* For each portion of the row in 16K chunks + */ + hi = a_hi; + lo = a_lo; + esize = lda; + + p_lo = panel_lo; + + for (x=0; x<(int)row_len; x+=(int)size) { + + left = row_len - (unsigned int)x; + size = (left < 16384) ? left : 16384; + + buf = &bufA[tag][0]; + spu_mfcdma64(buf, panel_hi, p_lo, size, tag, MFC_GET_CMD); + + p_lo += size; + + /* Construct a list for the placement into blocked format. + */ + list = (unsigned int *)(&bufB[0][0] + list_offset); + for (i=0, idx=0; i<(int)size; i+=(int)M_SUB*sizeof(double)) { + esize = size - i; + if (esize > M_SUB*sizeof(double)) esize = M_SUB*sizeof(double); + list[idx+0] = esize; + list[idx+1] = lo; + idx += 2; +#ifdef MATRIX_4GB_CROSSING + carry = spu_extract(spu_genc(spu_promote(lo, 0), spu_promote(lda, 0)), 0); + /* If we cross a 4GB boundary, flush the list and start a new one. 
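+ * (spu_genc above returns the carry out of the 32-bit add of lo and
+ * lda: carry is nonzero exactly when the next element's low address
+ * word wraps, so the high word hi must be advanced.)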
+ */
+        if (carry) {
+          spu_mfcdma64(buf, hi, (unsigned int)list, 4*idx, tag, MFC_PUTLB_CMD);
+          buf += (M_SUB/4)*idx;
+          list += idx;
+          idx = 0;
+          hi += carry;
+        }
+#endif
+        lo += lda;
+      }
+      spu_mfcdma64(buf, hi, (unsigned int)list, 4*idx, tag, MFC_PUTLB_CMD);
+      spu_mfcdma32(0, 0, 0, tag, MFC_BARRIER_CMD);
+
+      /* Advance pointers to next row or buffer
+       */
+      list_offset = (list_offset + 16) % (128*16);  /* accommodate up to 128 enqueued DMAs */
+      tag ^= 1;
+    }
+
+    /* Handle the final odd column values
+     */
+    if (extra) {
+      buf = &bufA[tag][0];
+      if (size & (M_SUB*sizeof(double)-1)) {
+        addend = esize - lda;
+        MATRIX_EA_ADD32(hi, lo, addend);
+      }
+      spu_mfcdma64(buf, panel_hi, p_lo, extra, tag, MFC_GET_CMD);
+      spu_mfcdma64(buf, hi, lo, extra, tag, MFC_PUTB_CMD);
+      tag ^= 1;
+    }
+
+    /* Advance pointers to the next row */
+    addend = M_SUB*sizeof(double)*HPL_ACCEL_SPES;
+    MATRIX_EA_UADD32(a_hi, a_lo, addend);
+    panel_lo += ldp * HPL_ACCEL_SPES;
+
+  }
+  /* Wait for next to last DMA to complete before posting completion.
+   */
+  DMA_WAIT(1<<tag);
+
+  report_completion(id, cmd_parms->incomplete, tag^1);
+}
+
Index: accel/lib/spu/accel_reform_rows_B_to_R.c
===================================================================
RCS file: accel/lib/spu/accel_reform_rows_B_to_R.c
diff -N accel/lib/spu/accel_reform_rows_B_to_R.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ accel/lib/spu/accel_reform_rows_B_to_R.c	22 Oct 2008 03:28:08 -0000	1.3
@@ -0,0 +1,166 @@
+/* ---------------------------------------------------------------- */
+/* (C) Copyright IBM Corporation 2007,2008                          */
+/*                                                                  */
+/* ---------------------------------------------------------------- */
+
+#include
+#include
+#include
+#include
+#include "hpl_accel_spu.h"
+#include "accel_buffers.h"
+#include "accel_utils.h"
+#include "accel_reform.h"
+
+
+static inline void row_B_to_R(unsigned int src_hi, unsigned int src_lo, int ld_src,
+                              unsigned int dst_hi, unsigned int dst_lo,
+                              unsigned int skip, unsigned int left, void *buf)
+{
+  unsigned int src_size, dst_size, size;
+  void *ptr;
+
+  dst_size = 16*1024 - skip;
+  src_size = (M_SUB*sizeof(double)) - skip;
+  if (dst_size > left) dst_size = left;
+  if (src_size > left) src_size = left;
+
+  while (left) {
+    /* Fetch (up to) 16KB buffer of M_SUB spans */
+    spu_mfcdma64(buf, src_hi, src_lo+skip, src_size, 0, MFC_GETB_CMD);
+    ptr = buf;
+
+    skip = 0;
+    size = dst_size;
+    left -= dst_size;
+
+    while ((size -= src_size)) {
+      ptr += src_size;
+      MATRIX_EA_UADD32(src_hi, src_lo, ld_src);
+      src_size = (size > (M_SUB*sizeof(double))) ? M_SUB*sizeof(double) : size;
+
+      spu_mfcdma64(ptr, src_hi, src_lo, src_size, 0, MFC_GET_CMD);
+    }
+
+    /* Store the 16KB span into the row buffer */
+    spu_mfcdma64(buf, dst_hi, dst_lo, dst_size, 0, MFC_PUTB_CMD);
+
+    MATRIX_EA_UADD32(src_hi, src_lo, ld_src);
+    dst_lo += dst_size;
+    dst_size = (left > 16*1024) ? 16*1024 : left;
+    src_size = (dst_size > (M_SUB*sizeof(double))) ?
M_SUB*sizeof(double) : dst_size; + } +} + + + +void accel_reform_rows_B_to_R(hpl_accel_init_parms_t *parms, + volatile hpl_accel_reform_rows_parms_t *cmd_parms) +{ + int i; + int m, n, ldr, lda; + int row; + unsigned int id; + unsigned int a_hi, a_lo, r_hi, r_lo; + unsigned int blk_col, skip, mask; + unsigned int spans, spans_per_spe, extra_spans, start_span, end_span; + unsigned int start_col, end_col, max_end_col; + unsigned int row_size; + vector signed int m_n_ldr_lda; + vector unsigned long long rows_a, incomplete_blk_col; + void *buf; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + m_n_ldr_lda = cmd_parms->m_n_ldr_lda; + rows_a = cmd_parms->rows_a; + incomplete_blk_col = cmd_parms->incomplete_blk_col; + + m = spu_extract(m_n_ldr_lda, 0); + n = spu_extract(m_n_ldr_lda, 1); + ldr = spu_extract(m_n_ldr_lda, 2); + lda = spu_extract(m_n_ldr_lda, 3); + + blk_col = spu_extract((vector unsigned int)incomplete_blk_col, 2); + + r_hi = spu_extract((vector unsigned int)rows_a, 0); + r_lo = spu_extract((vector unsigned int)rows_a, 1); + + a_hi = spu_extract((vector unsigned int)rows_a, 2); + a_lo = spu_extract((vector unsigned int)rows_a, 3); + + buf = bufA; + + skip = (blk_col % M_SUB) * sizeof(double); + + blk_col /= M_SUB; + MATRIX_EA_UMADD32(a_hi, a_lo, blk_col, lda); + + /* Equally assign complete rows to each of the SPEs. + */ + row_size = n*sizeof(double); + + /* Process remaining rows by assigning each row to groups of HPL_ACCEL_SPES SPEs. + * Compute the spanning parameters assigned to this SPE. + */ + spans = (row_size + skip + (M_SUB-1)*sizeof(double)) / (M_SUB * sizeof(double)); + spans_per_spe = spans / HPL_ACCEL_SPES; + extra_spans = spans % HPL_ACCEL_SPES; + + start_span = id * spans_per_spe + ((id > extra_spans) ? extra_spans : id); + end_span = start_span + spans_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_spans, 0), spu_promote(id, 0)), 0); + + if (end_span > start_span) { + start_col = start_span * (M_SUB * sizeof(double)); + end_col = end_span * (M_SUB * sizeof(double)); + + max_end_col = skip + row_size; + + mask = spu_extract(spu_cmpeq(spu_promote(id, 0), 0), 0); + r_lo += start_col - (skip & ~mask); + MATRIX_EA_UMADD32(a_hi, a_lo, start_span, lda); + + skip &= mask; + + start_col += skip; + end_col = (end_col > max_end_col) ? max_end_col : end_col; + + row_size = end_col - start_col; + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + for (i=0; iblk_rows[i]; + + hi = a_hi; + lo = a_lo; + EA_UADD64(hi, lo, (unsigned int)row >> (32-9), (unsigned int)row << 9); + row_B_to_R(hi, lo, lda, r_hi, r_lo + (i*ldr), skip, row_size, buf); +#else + row = cmd_parms->blk_rows[i]; + row_B_to_R(a_hi, a_lo + (row * (M_SUB * sizeof(double))), lda, r_hi, r_lo + (i*ldr), skip, row_size, buf); +#endif + } + } else { + DMA_WAIT_RECEIVE(); + } + + /* Report completion status if requested. 
+ */ + report_completion(id, spu_extract(incomplete_blk_col, 0), 0); +} + + + Index: accel/lib/spu/accel_reform_rows_R_to_B.c =================================================================== RCS file: accel/lib/spu/accel_reform_rows_R_to_B.c diff -N accel/lib/spu/accel_reform_rows_R_to_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_reform_rows_R_to_B.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,164 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + + +static inline void row_R_to_B(unsigned int src_hi, unsigned int src_lo, + unsigned int dst_hi, unsigned int dst_lo, int ld_dst, + unsigned int skip, unsigned int left, void *buf) +{ + unsigned int src_size, dst_size; + void *ptr; + + + src_size = 16*1024 - skip; + dst_size = (M_SUB*sizeof(double)) - skip; + if (src_size > left) src_size = left; + if (dst_size > left) dst_size = left; + + while (left) { + /* Fetch a big (16KB) span from the row buffer */ + + spu_mfcdma64(buf, src_hi, src_lo, src_size, 0, MFC_GETB_CMD); + + left -= src_size; + src_lo += src_size; + + /* Store the big span into the matrix in M_SUB element spans */ + spu_mfcdma64(buf, dst_hi, dst_lo+skip, dst_size, 0, MFC_PUTB_CMD); + ptr = buf; + skip = 0; + + while ((src_size -= dst_size)) { + ptr += dst_size; + MATRIX_EA_UADD32(dst_hi, dst_lo, ld_dst); + dst_size = (src_size > (M_SUB*sizeof(double))) ? M_SUB*sizeof(double) : src_size; + + spu_mfcdma64(ptr, dst_hi, dst_lo, dst_size, 0, MFC_PUT_CMD); + } + MATRIX_EA_UADD32(dst_hi, dst_lo, ld_dst); + src_size = (left > 16*1024) ? 16*1024 : left; + dst_size = (src_size > (M_SUB*sizeof(double))) ? M_SUB*sizeof(double) : src_size; + } +} + + + +void accel_reform_rows_R_to_B(hpl_accel_init_parms_t *parms, + volatile hpl_accel_reform_rows_parms_t *cmd_parms) +{ + int i; + int m, n, ldr, lda; + int row; + unsigned int id; + unsigned int a_hi, a_lo, r_hi, r_lo; + unsigned int blk_col, skip, mask; + unsigned int spans, spans_per_spe, extra_spans, start_span, end_span; + unsigned int start_col, end_col, max_end_col; + unsigned int row_size; + vector signed int m_n_ldr_lda; + vector unsigned long long rows_a, incomplete_blk_col; + void *buf; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + m_n_ldr_lda = cmd_parms->m_n_ldr_lda; + rows_a = cmd_parms->rows_a; + incomplete_blk_col = cmd_parms->incomplete_blk_col; + + m = spu_extract(m_n_ldr_lda, 0); + n = spu_extract(m_n_ldr_lda, 1); + ldr = spu_extract(m_n_ldr_lda, 2); + lda = spu_extract(m_n_ldr_lda, 3); + + blk_col = spu_extract((vector unsigned int)incomplete_blk_col, 2); + + r_hi = spu_extract((vector unsigned int)rows_a, 0); + r_lo = spu_extract((vector unsigned int)rows_a, 1); + + a_hi = spu_extract((vector unsigned int)rows_a, 2); + a_lo = spu_extract((vector unsigned int)rows_a, 3); + + buf = bufA; + + skip = (blk_col % M_SUB) * sizeof(double); + + blk_col /= M_SUB; + MATRIX_EA_UMADD32(a_hi, a_lo, blk_col, lda); + + /* Equally assign complete rows to each of the SPEs. + */ + row_size = n*sizeof(double); + + /* Process remaining rows by assigning each row to groups of 4 SPEs. + * Compute the spanning parameters assigned to this SPE. 
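+ *
+ * Worked example (sketch): spans = 19 across 8 SPEs gives
+ * spans_per_spe = 2 and extra_spans = 3, so SPEs 0-2 handle 3 spans
+ * each and SPEs 3-7 handle 2, with
+ * start_span = id*spans_per_spe + min(id, extra_spans).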
+ */ + spans = (row_size + skip + (M_SUB-1)*sizeof(double)) / (M_SUB * sizeof(double)); + spans_per_spe = spans / 8; + extra_spans = spans % 8; + + start_span = id * spans_per_spe + ((id > extra_spans) ? extra_spans : id); + end_span = start_span + spans_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_spans, 0), spu_promote(id, 0)), 0); + + if (end_span > start_span) { + start_col = start_span * (M_SUB * sizeof(double)); + end_col = end_span * (M_SUB * sizeof(double)); + + max_end_col = skip + row_size; + + mask = spu_extract(spu_cmpeq(spu_promote(id, 0), 0), 0); + r_lo += start_col - (skip & ~mask); + MATRIX_EA_UMADD32(a_hi, a_lo, start_span, lda); + + skip &= mask; + + start_col += skip; + end_col = (end_col > max_end_col) ? max_end_col : end_col; + + row_size = end_col - start_col; + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + for (i=0; iblk_rows[i]; + + hi = a_hi; + lo = a_lo; + EA_UADD64(hi, lo, (unsigned int)row >> (32-9), (unsigned int)row << 9); + row_R_to_B(r_hi, r_lo + (i*ldr), hi, lo, lda, skip, row_size, buf); +#else + row = cmd_parms->blk_rows[i]; + row_R_to_B(r_hi, r_lo + (i*ldr), a_hi, a_lo + (row * (M_SUB * sizeof(double))), lda, skip, row_size, buf); +#endif + } + } else { + DMA_WAIT_RECEIVE(); + } + + /* Report completion status if requested. + */ + report_completion(id, spu_extract(incomplete_blk_col, 0), 0); +} Index: accel/lib/spu/accel_spu.h =================================================================== RCS file: accel/lib/spu/accel_spu.h diff -N accel/lib/spu/accel_spu.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_spu.h 20 Aug 2008 03:57:53 -0000 1.7 @@ -0,0 +1,49 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _ACCEL_SPU_H_ +#define _ACCEL_SPU_H_ 1 + +typedef void (*accel_specialist_t)(hpl_accel_init_parms_t *, volatile void *); + +/* Accellerator specialists and dispatch table + */ +extern void accel_dgemm(hpl_accel_init_parms_t *, volatile void *); +extern void accel_dgemm_C_C_C(hpl_accel_init_parms_t *, volatile void *); +extern void accel_dgemm_panel(hpl_accel_init_parms_t *, volatile void *); +extern void accel_dtrsm(hpl_accel_init_parms_t *, volatile void *); +extern void accel_dtrsm_panel(hpl_accel_init_parms_t *, volatile void *); +extern void accel_reform_matrix_CL_to_B(hpl_accel_init_parms_t *, volatile void *); +extern void accel_reform_panel_B_to_CL(hpl_accel_init_parms_t *, volatile void *); +extern void accel_reform_panel_R_to_B(hpl_accel_init_parms_t *, volatile void *); +extern void accel_reform_rows_R_to_B(hpl_accel_init_parms_t *, volatile void *); +extern void accel_reform_rows_B_to_R(hpl_accel_init_parms_t *, volatile void *); +extern void accel_fini(hpl_accel_init_parms_t *, volatile void *); +extern void accel_dtrsm_CL_B(hpl_accel_init_parms_t *, volatile void *); +extern void accel_swap_rows_B_to_B(hpl_accel_init_parms_t *, volatile void *); +extern void accel_copy_rows_R_to_R(hpl_accel_init_parms_t *, volatile void *); + + +accel_specialist_t dispatch[] = { + &accel_dgemm, + &accel_dtrsm, + &accel_reform_matrix_CL_to_B, + &accel_reform_panel_B_to_CL, + &accel_reform_panel_R_to_B, + &accel_dgemm_panel, + &accel_reform_rows_R_to_B, + &accel_reform_rows_B_to_R, + &accel_fini, + &accel_dtrsm_CL_B, + &accel_dtrsm_panel, + &accel_dgemm_C_C_C, + 
&accel_swap_rows_B_to_B, + &accel_copy_rows_R_to_R +}; + +#endif /* _ACCEL_SPU_H_ */ + + + Index: accel/lib/spu/accel_swap_rows_B_to_B.c =================================================================== RCS file: accel/lib/spu/accel_swap_rows_B_to_B.c diff -N accel/lib/spu/accel_swap_rows_B_to_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_swap_rows_B_to_B.c 20 Aug 2008 03:57:53 -0000 1.5 @@ -0,0 +1,186 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + + +static inline void row_B_to_B(unsigned int src_hi, unsigned int src_lo, + unsigned int dst_hi, unsigned int dst_lo, + int ld, unsigned int skip, unsigned int left) +{ + unsigned int size, blk_size; + + if (skip>0) { + size = (M_SUB*sizeof(double)) - skip; + if (size > left) size = left; + + spu_mfcdma64(bufA+skip, src_hi, src_lo+skip, size, 0, MFC_GET_CMD); + spu_mfcdma64(bufB+skip, dst_hi, dst_lo+skip, size, 0, MFC_GET_CMD); + spu_mfcdma64(bufA+skip, dst_hi, dst_lo+skip, size, 0, MFC_PUTB_CMD); + spu_mfcdma64(bufB+skip, src_hi, src_lo+skip, size, 0, MFC_PUT_CMD); + + MATRIX_EA_UADD32(src_hi, src_lo, ld); + MATRIX_EA_UADD32(dst_hi, dst_lo, ld); + left -= size; + } + + while (left) { + void *ptrA, *ptrB; + unsigned int get_size, put_size; + unsigned int save_src_hi = src_hi, save_src_lo = src_lo; + unsigned int save_dst_hi = dst_hi, save_dst_lo = dst_lo; + + /* size is the number of bytes swapped in this iteration of the loop */ + size = 16*1024; + if (size > left) size = left; + + /* Barrier to ensure all prior transfers are complete */ + spu_mfcdma64(0, 0, 0, 0, 0, MFC_BARRIER_CMD); + + /* Fetch (up to) 16KB of src and dst rows into separate buffers */ + ptrA = bufA; + ptrB = bufB; + get_size = size; + while (get_size) { + blk_size = (M_SUB*sizeof(double)); + if (blk_size > get_size) blk_size = get_size; + spu_mfcdma64(ptrA, src_hi, src_lo, blk_size, 0, MFC_GET_CMD); + spu_mfcdma64(ptrB, dst_hi, dst_lo, blk_size, 0, MFC_GET_CMD); + ptrA += blk_size; + ptrB += blk_size; + MATRIX_EA_UADD32(src_hi, src_lo, ld); + MATRIX_EA_UADD32(dst_hi, dst_lo, ld); + get_size -= blk_size; + } + + /* Barrier to ensure all gets are complete */ + spu_mfcdma64(0, 0, 0, 0, 0, MFC_BARRIER_CMD); + + src_hi = save_src_hi; src_lo = save_src_lo; + dst_hi = save_dst_hi; dst_lo = save_dst_lo; + + /* Store the fetched bytes back into the src and dst rows */ + ptrA = bufA; + ptrB = bufB; + put_size = size; + while (put_size) { + blk_size = (M_SUB*sizeof(double)); + if (blk_size > put_size) blk_size = put_size; + spu_mfcdma64(ptrB, src_hi, src_lo, blk_size, 0, MFC_PUT_CMD); + spu_mfcdma64(ptrA, dst_hi, dst_lo, blk_size, 0, MFC_PUT_CMD); + ptrA += blk_size; + ptrB += blk_size; + MATRIX_EA_UADD32(src_hi, src_lo, ld); + MATRIX_EA_UADD32(dst_hi, dst_lo, ld); + put_size -= blk_size; + } + + left -= size; + } + + DMA_WAIT(-1); +} + + +void accel_swap_rows_B_to_B(hpl_accel_init_parms_t *parms, + volatile hpl_accel_swap_rows_parms_t *cmd_parms) +{ + int m, n, lda; + int src, dst; + unsigned int id; + unsigned int a_hi, a_lo; + unsigned int blk_col, row_size, skip, mask; + unsigned int spans, spans_per_spe, extra_spans, start_span, end_span; + unsigned int start_col, end_col, max_end_col; + vector signed int m_n_lda_blk_col; + vector unsigned long long 
a_incomplete; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + m_n_lda_blk_col = cmd_parms->m_n_lda_blk_col; + a_incomplete = cmd_parms->a_incomplete; + + m = spu_extract(m_n_lda_blk_col, 0); + n = spu_extract(m_n_lda_blk_col, 1); + lda = spu_extract(m_n_lda_blk_col, 2); + blk_col = spu_extract(m_n_lda_blk_col, 3); + + a_hi = spu_extract((vector unsigned int)a_incomplete, 0); + a_lo = spu_extract((vector unsigned int)a_incomplete, 1); + + skip = (blk_col % M_SUB) * sizeof(double); + blk_col /= M_SUB; + + MATRIX_EA_UMADD32(a_hi, a_lo, blk_col, lda); + + /* Process rows by assigning each row to a group of 8 SPEs. + * Compute the spanning parameters assigned to this SPE. + */ + row_size = n*sizeof(double); + spans = (row_size + skip + (M_SUB-1)*sizeof(double)) / (M_SUB * sizeof(double)); + spans_per_spe = spans / 8; + extra_spans = spans % 8; + + start_span = id * spans_per_spe + ((id > extra_spans) ? extra_spans : id); + end_span = start_span + spans_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_spans, 0), spu_promote(id, 0)), 0); + + if (end_span > start_span) { + start_col = start_span * (M_SUB * sizeof(double)); + end_col = end_span * (M_SUB * sizeof(double)); + + max_end_col = skip + row_size; + + mask = spu_extract(spu_cmpeq(spu_promote(id, 0), 0), 0); + MATRIX_EA_UMADD32(a_hi, a_lo, start_span, lda); + + skip &= mask; + + start_col += skip; + end_col = (end_col > max_end_col) ? max_end_col : end_col; + + row_size = end_col - start_col; + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + for (src=0; srcblk_rows[src]; + if ( src != dst ) { +#ifdef MATRIX_4GB_CROSSING + unsigned int src_hi = a_hi, src_lo = a_lo, dst_hi = a_hi, dst_lo = a_lo; + EA_UADD64(src_hi, src_lo, (unsigned int)src >> (32-9), (unsigned int)src << 9); + EA_UADD64(dst_hi, dst_lo, (unsigned int)dst >> (32-9), (unsigned int)dst << 9); + row_B_to_B(src_hi, src_lo, dst_hi, dst_lo, lda, skip, row_size); +#else + row_B_to_B(a_hi, a_lo + (src * (M_SUB * sizeof(double))), + a_hi, a_lo + (dst * (M_SUB * sizeof(double))), + lda, skip, row_size); +#endif + } + } + } else { + DMA_WAIT_RECEIVE(); + } + + /* Report completion status if requested. + */ + report_completion(id, spu_extract(a_incomplete, 1), 0); +} Index: accel/lib/spu/accel_utils.h =================================================================== RCS file: accel/lib/spu/accel_utils.h diff -N accel/lib/spu/accel_utils.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_utils.h 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,173 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _ACCEL_UTILS_H_ +#define _ACCEL_UTILS_H_ + +#include +#include + +#define LIKELY(COND) __builtin_expect(COND, 1) +#define UNLIKELY(COND) __builtin_expect(COND, 0) + +/* The waiting for DMA has been broken into two parts. + * 1) DMA_WAIT_REQUEST - Make a channel request for the wait by + * setting the tag mask and writing to the + * tag update channel. + * 2) DMA_WAIT_RECEIVE - Reading the tag status. + * + * The two parts needs to be seperated by 36 cycles to avoid + * stalling even when no DMAs are still in flight. 
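+ *
+ * Typical overlapped use (sketch):
+ *
+ *   DMA_WAIT_REQUEST(1<<tag);            # post the tag mask early
+ *   ... at least 36 cycles of independent work ...
+ *   DMA_WAIT_RECEIVE();                  # then read status without stalling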
+ */
+#define DMA_WAIT_REQUEST(_mask) spu_writech(MFC_WrTagMask, _mask); \
+                                spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL);
+#define DMA_WAIT_RECEIVE()      (void)spu_readch(MFC_RdTagStat);
+
+
+/* The simplified DMA_WAIT is used by the reformatting routines since
+ * these are not computation bound and do not benefit from splitting
+ * the wait into two parts.
+ */
+#define DMA_WAIT(_mask)         DMA_WAIT_REQUEST(_mask); \
+                                DMA_WAIT_RECEIVE();
+
+
+
+/* Add the unsigned 32-bit _addend to the 64-bit effective address _eah,_eal.
+ */
+#define EA_UADD32(_eah, _eal, _addend) { \
+  unsigned int _a; \
+ \
+  _a = _addend; \
+  _eah += spu_extract(spu_genc(spu_promote(_eal, 0), \
+                               spu_promote(_a, 0)), 0); \
+  _eal += _a; \
+}
+
+/* Add the signed 32-bit _addend to the 64-bit effective address _eah,_eal.
+ */
+#define EA_ADD32(_eah, _eal, _addend) { \
+  vec_uint4 _va; \
+ \
+  _va = spu_promote((unsigned int)_addend, 0); \
+  _eah = spu_extract(spu_addx(spu_promote(_eah, 0), \
+                              spu_rlmaska(_va, -31), \
+                              spu_genc(spu_promote(_eal, 0), _va)), 0); \
+  _eal += spu_extract(_va, 0); \
+}
+
+
+
+/* Add the unsigned 64-bit addend specified by _ah,_al to the 64-bit effective
+ * address _eah,_eal.
+ */
+#define EA_UADD64(_eah, _eal, _ah, _al) { \
+  vec_uint4 _vah, _val; \
+ \
+  _vah = spu_promote((unsigned int)_ah, 0); \
+  _val = spu_promote((unsigned int)_al, 0); \
+  _eah = spu_extract(spu_addx(spu_promote(_eah, 0), \
+                              _vah, \
+                              spu_genc(spu_promote(_eal, 0), _val)), 0); \
+  _eal += spu_extract(_val, 0); \
+}
+
+
+
+/* Multiply two unsigned 32-bit values, _m1 and _m2, and return the 64-bit product
+ * in _ph,_pl.
+ */
+#define EA_UMUL32(_ph, _pl, _m1, _m2) \
+{ \
+  vec_uint4 _vll, _vlh, _vhl, _vh, _vl, _v0, _v1; \
+  vec_ushort8 _va, _vb, _vb2; \
+ \
+  _va = (vec_ushort8)spu_promote(_m1, 0); \
+  _vb = (vec_ushort8)spu_promote(_m2, 0); \
+  _vb2 = (vec_ushort8)spu_rl((vec_uint4)_vb, 16); \
+ \
+  _vll = spu_mulo(_va, _vb); \
+  _vlh = spu_mulo(_va, _vb2); \
+  _vhl = spu_mule(_va, _vb2); \
+ \
+  _vh = spu_mhhadd(_va, _vb, spu_add(spu_rlmask(_vhl, -16), spu_rlmask(_vlh, -16))); \
+ \
+  _v0 = spu_sl(_vhl, 16); \
+  _v1 = spu_sl(_vlh, 16); \
+ \
+  _vh = spu_add(_vh, spu_genc(_v1, _v0)); \
+  _vl = spu_add(_v1, _v0); \
+  _vh = spu_add(_vh, spu_genc(_vl, _vll)); \
+  _vl = spu_add(_vl, _vll); \
+ \
+  _ph = spu_extract(_vh, 0); \
+  _pl = spu_extract(_vl, 0); \
+}
+
+/* Multiply two unsigned 32-bit values, _m1 and _m2, and add the 64-bit product to
+ * the 64-bit effective address.
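+ *
+ * For example (sketch), stepping an effective address down blk_col full
+ * columns of lda bytes each, where the product may not fit in 32 bits:
+ *
+ *   EA_UMADD32(a_hi, a_lo, blk_col, lda);   # ea += (u64)blk_col * lda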
+ */ +#define EA_UMADD32(_eah, _eal, _m1, _m2) { \ + unsigned int _ph, _pl; \ + EA_UMUL32(_ph, _pl, _m1, _m2); \ + EA_UADD64(_eah, _eal, _ph, _pl); \ +} + + +#ifdef PANEL_4GB_CROSSING +#define PANEL_EA_ADD32(_eah, _eal, _addend) EA_ADD32(_eah, _eal, _addend) +#define PANEL_EA_UADD32(_eah, _eal, _addend) EA_UADD32(_eah, _eal, _addend) +#else +#define PANEL_EA_ADD32(_eah, _eal, _addend) _eal += _addend; +#define PANEL_EA_UADD32(_eah, _eal, _addend) _eal += _addend; +#endif + +#ifdef MATRIX_4GB_CROSSING +#define MATRIX_EA_ADD32(_eah, _eal, _addend) EA_ADD32(_eah, _eal, _addend) +#define MATRIX_EA_UADD32(_eah, _eal, _addend) EA_UADD32(_eah, _eal, _addend) +#define MATRIX_EA_UMADD32(_eah, _eal, _m1, _m2) EA_UMADD32(_eah, _eal, _m1, _m2) +#else +#define MATRIX_EA_ADD32(_eah, _eal, _addend) _eal += _addend; +#define MATRIX_EA_UADD32(_eah, _eal, _addend) _eal += _addend; +#define MATRIX_EA_UMADD32(_eah, _eal, _m1, _m2) _eal += _m1 * _m2; +#endif + + +/* report_completion + * ----------------- + * Write a byte to system memory to report that the requested operation + * has been completed by the specified SPE. The DMA put is fenced using + * the specified tag ID so that the writeback is ordered with respect + * to the results posted to system memory. Caller's MUST ensure that the + * tag ID be the same as the DMA for the results. + */ +static vec_uchar16 completion_writeback = (vec_uchar16){0}; + +static inline void report_completion(int id, + unsigned long long incomplete_ea, + unsigned int tag) +{ + unsigned int incomplete_hi, incomplete_lo; + unsigned int size; + void *lsa; + + incomplete_lo = mfc_ea2l(incomplete_ea); + incomplete_hi = mfc_ea2h(incomplete_ea); + + size = 1 & ~(spu_extract(spu_cmpeq(spu_or(spu_promote(incomplete_hi, 0), + spu_promote(incomplete_lo, 0)), 0), 0)); + + incomplete_lo += id; + + lsa = ((void *)&completion_writeback) + (incomplete_lo & 0xF);; + + spu_mfcdma64(lsa, incomplete_hi, incomplete_lo, size, tag, MFC_PUTF_CMD); +} + + +#endif /* _ACCEL_UTILS_H_ */ + + + Index: accel/lib/spu/hpl_accel_spu.c =================================================================== RCS file: accel/lib/spu/hpl_accel_spu.c diff -N accel/lib/spu/hpl_accel_spu.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/hpl_accel_spu.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,60 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_utils.h" +#include "accel_spu.h" + +volatile hpl_accel_init_parms_t parms; + +volatile unsigned char cmd_parms[128] __attribute__ ((aligned (128))); + + +int main(unsigned long long speid __attribute__ ((unused)), + unsigned long long parms_ea) +{ + unsigned int cmd; + uint64_t cmd_queue; + + /* Fetch the global parameters + */ + + mfc_get(&parms, parms_ea, sizeof(parms), HPL_ACCEL_PARM_TAG, 0, 0); + DMA_WAIT(1 << HPL_ACCEL_PARM_TAG); + + while (1) { + cmd = spu_readch(SPU_RdInMbox); + + /* Fetch the command parameters + */ + cmd_queue = parms.cmd_base + (cmd & ~HPL_ACCEL_CMD_MASK); + + mfc_get((volatile void *)cmd_parms, cmd_queue, 128, HPL_ACCEL_PARM_TAG, 0, 0); + + DMA_WAIT_REQUEST(1< +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +int norepeat_rand_row(int max, int *rows, int cnt) +{ + int i; + int new_row; + int unique; + + do { + new_row = (max * (rand() & 0xFFFF)) >> 16; + unique = 1; + + for 
(i=0; i=0; i--) { + if (B1[i] != B2[i]) { + errors++; + if (errors < 20) printf("B1<->B2 %d expected=%f got=%f\n", i, B1[i], B2[i]); + } + } + + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dgemm.c =================================================================== RCS file: accel/lib/tests/dgemm.c diff -N accel/lib/tests/dgemm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,113 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + + +/* dgemm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int k=128; + int n=128; + int m=128; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + + switch (argc) { + case 6: + ldc = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 1280; + n = rand() % 1280; + lda = rand() % 2048; + ldb = rand() % 2048; + ldc = rand() % (64*2048); + break; + default: + printf("Usage: %s [m [n [lda [ldb [ldc]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + m = m & ~(64-1); + if (m < 64) m = 64; + + n = n & ~(64-1); + if (n < 64) n = 64; + + lda = lda & ~(15); + if (lda < m) lda = m; + + ldb = ldb & ~(15); + if (ldb < n) ldb = n; + + ldc = ldc & ~(15); + if (ldc < 64*m) ldc = 64*m; + + printf("Performing dgemm test with m=%d n=%d k=%d lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_panel(k, ldb, 128); + C1 = (double *)allocate_matrix(n/64, ldc, 128); + C2 = (double *)allocate_matrix(n/64, ldc, 128); + + for (i=0; i=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + errors++; + printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + printf("Errors = %d\n", errors); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dgemm_CL_B_B.c =================================================================== RCS file: accel/lib/tests/dgemm_CL_B_B.c diff -N accel/lib/tests/dgemm_CL_B_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm_CL_B_B.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,113 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + + +/* dgemm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int k=128; + int n=128; + int m=128; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + + switch (argc) { + case 6: + ldc = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. 
+ */ + srand((unsigned int)__mftb()); + m = rand() % 1280; + n = rand() % 1280; + lda = rand() % 2048; + ldb = rand() % 16384; + ldc = rand() % (64*2048); + break; + default: + printf("Usage: %s [m [n [lda [ldb [ldc]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + m = m & ~(64-1); + if (m < 64) m = 64; + + n = n & ~(64-1); + if (n < 64) n = 64; + + lda = lda & ~(15); + if (lda < m) lda = m; + + ldb = ldb & ~(15); + if (ldb < 64*k) ldb = 64*k; + + ldc = ldc & ~(15); + if (ldc < 64*m) ldc = 64*m; + + printf("Performing dgemm test with m=%d n=%d k=%d lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_matrix(n/64, ldb, 128); + C1 = (double *)allocate_matrix(n/64, ldc, 128); + C2 = (double *)allocate_matrix(n/64, ldc, 128); + + for (i=0; i=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + errors++; + printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dgemm_CL_B_B_CL.c =================================================================== RCS file: accel/lib/tests/dgemm_CL_B_B_CL.c diff -N accel/lib/tests/dgemm_CL_B_B_CL.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm_CL_B_B_CL.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,179 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + +unsigned long long dab(double d) +{ + union { + unsigned long long ull; + double d; + } x; + x.d = d; + return (x.ull); +} + + +/* dgemm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int ldp = 0; + int c_col = 0; + int c_row = 0; + int c_cols, c_rows; + int k=128; + int n=128; + int m=128; + int csize, psize, bsize; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + double *P1 = NULL; + double *P2 = NULL; + + switch (argc) { + case 9: + c_col = atoi(argv[8]); + case 8: + c_row = atoi(argv[7]); + case 7: + ldp = atoi(argv[6]); + case 6: + ldc = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 1024; + n = rand() % 1024; + lda = rand() % 1200; + ldb = rand() % 1200; + ldc = rand() % 1200; + if (rand() & 1) ldp = rand() % 1200; + c_row = rand() % 150; + c_col = rand() % 150; + + if (rand() & 1) { + lda &= ~1; + ldb &= ~1; + ldc &= ~1; + ldp &= ~1; + c_row = 0; + c_col = 0; + } + + break; + default: + printf("Usage: %s [m [n [lda [ldb [ldc [ldp [c_col [c_row]]]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. 
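+ * (Sketch of the rounding idioms below: "(x + 15) & ~15" rounds x up
+ * to the next multiple of 16, e.g. 100 becomes 112, and
+ * "(x + 63) & ~63" likewise rounds up to the next multiple of 64.)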
+ */ + if (lda < m) lda = m; + lda = (lda + 15) & ~(15); + + if (ldb < M_SUB*k) ldb = M_SUB*k; + ldb = (ldb + 15) & ~(15); + + if (ldc < m) ldc = m; + ldc = (ldc + 15) & ~(15); + + if (ldp) { + if (ldp < m) ldp = m; + ldp = (ldp + 15) & ~(15); + } + + c_cols = c_col + n; + c_rows = c_row + m; + + if (ldc < c_rows*64) ldc = c_rows*64; + ldc = ((ldc + 63) & ~63); + c_cols = (c_cols + 63) & ~63; + + csize = ldc*c_cols/64; + psize = ldp*n; + + bsize = ldb*(n+M_SUB-1)/M_SUB; + + printf("Performing dgemm test with m=%d n=%d lda=%d ldb=%d ldc=%d ldp=%d c_row=%d c_col=%d\n", m, n, lda, ldb, ldc, ldp, c_row, c_col); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_matrix(n+63, ldb, 128); + if (ldp) { + C1 = C2 = (double *)allocate_matrix(c_cols, ldc, 128); + P1 = (double *)allocate_panel(n, ldp, 128); + P2 = (double *)allocate_panel(n, ldp, 128); + for (i=0; i=0; i--) { + double p1, p2; + + p1 = byte_swap(P1[i]); + p2 = byte_swap(P2[i]); + + if (fabs(p1 - p2) > EPSILON) { + if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, p1, p2); + } + } + } else { + for (i=csize, errors=0; i>=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dgemm_CL_C_C.c =================================================================== RCS file: accel/lib/tests/dgemm_CL_C_C.c diff -N accel/lib/tests/dgemm_CL_C_C.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm_CL_C_C.c 14 May 2008 21:35:01 -0000 1.3 @@ -0,0 +1,126 @@ +/* ------------------------------------------------------------------ */ +/* (C) Copyright 2007 */ +/* International Business Machines Corporation, */ +/* */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------ */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + + +/* dgemm_CL_C_C + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int k=64; + int n=64; + int m=128; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + + switch (argc) { + case 7: + ldc = atoi(argv[6]); + case 6: + ldb = atoi(argv[5]); + case 5: + lda = atoi(argv[4]); + case 4: + k = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 2000; + n = rand() % 70; + k = rand() % 70; + lda = rand() % 2000; + ldb = rand() % 200; + ldc = rand() % 2000; + + /* Force all parameter within constraints */ + if ((rand() & 1) == 0) { + k &= ~(4-1); + m &= ~(8-1); + n &= ~(4-1); + + if (k < 4) k = 4; + if (k > 64) k = 64; + if (m < 8) m = 8; + if (n < 4) n = 4; + if (n > 64) n = 64; + + lda &= ~1; + ldb &= ~1; + ldc &= ~1; + } + break; + default: + printf("Usage: %s [m [n [k [lda [ldb [ldc]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. 
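The dab() helper above and the byte_swap() routine in test_utils.h (further down in this patch) both lean on the same union idiom: reinterpret a double's storage without violating aliasing rules, either to inspect its bit pattern or to reverse its byte order before the element-wise compare in the *_CL_* tests. A minimal sketch of both:

    #include <stdio.h>

    /* Reinterpret a double's bits through a union (same idiom as dab()). */
    static unsigned long long bits_of(double d)
    {
        union { double d; unsigned long long u; } x;
        x.d = d;
        return x.u;
    }

    /* Reverse the 8 bytes of a double (same idiom as byte_swap()). */
    static double swap_double(double d)
    {
        union { double d; unsigned char c[8]; } in, out;
        int i;
        in.d = d;
        for (i = 0; i < 8; i++) out.c[i] = in.c[7 - i];
        return out.d;
    }

    int main(void)
    {
        printf("bits(1.0) = %016llx\n", bits_of(1.0));  /* 3ff0000000000000 */
        printf("round trip: %f\n", swap_double(swap_double(1.0)));
        return 0;
    }

byte_swap() is a no-op unless ACCEL_LITTLE_ENDIAN is defined, so the same verification loop serves both byte orders.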
+ */ + if (m == 0) m = 1; + if (n == 0) n = 1; + if (k == 0) k = 1; + + if (lda < m) lda = m; + if (ldb < k) ldb = k; + if (ldc < m) ldc = m; + + printf("Performing dgemm_CL_C_C test with m=%d n=%d k=%d lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_panel(n, ldb, 128); + C1 = (double *)allocate_panel(n, ldc, 128); + C2 = (double *)allocate_panel(n, ldc, 128); + + for (i=0; i=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + errors++; + printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + printf("Errors = %d\n", errors); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dgemm_CL_R_B_CL.c =================================================================== RCS file: accel/lib/tests/dgemm_CL_R_B_CL.c diff -N accel/lib/tests/dgemm_CL_R_B_CL.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm_CL_R_B_CL.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,177 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + +unsigned long long dab(double d) +{ + union { + unsigned long long ull; + double d; + } x; + x.d = d; + return (x.ull); +} + + +/* dgemm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int ldp = 0; + int c_col = 0; + int c_row = 0; + int c_cols, c_rows; + int k=128; + int n=128; + int m=128; + int csize, psize; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + double *P1 = NULL; + double *P2 = NULL; + + switch (argc) { + case 9: + c_col = atoi(argv[8]); + case 8: + c_row = atoi(argv[7]); + case 7: + ldp = atoi(argv[6]); + case 6: + ldc = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 1024; + n = rand() % 1024; + lda = rand() % 1536; + ldb = rand() % 1536; + ldc = rand() % 1536; + if (rand() & 1) ldp = rand() % 1536; + c_row = rand() % 256; + c_col = rand() % 256; + + if (rand() & 1) { + lda &= ~1; + ldb &= ~1; + ldc &= ~1; + ldp &= ~1; + c_row = 0; + c_col = 0; + } + + break; + default: + printf("Usage: %s [m [n [lda [ldb [ldc [ldp [c_col [c_row]]]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. 
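All of these correctness tests share one verification scheme: allocate two result buffers, run the reference path into one and the accelerated path into the other, then walk the buffers backwards comparing against a fixed absolute tolerance (EPSILON = 1e-7) and printing at most the first 20 mismatches. A compressed sketch of that harness, with run_reference/run_accel as hypothetical stand-ins for the library's reference and accelerated kernel pairs:

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define EPSILON 0.0000001

    /* Stand-ins: both fill the buffer identically in this sketch. */
    static void run_reference(double *c, int len) { int i; for (i = 0; i < len; i++) c[i] = (double)i; }
    static void run_accel(double *c, int len)     { int i; for (i = 0; i < len; i++) c[i] = (double)i; }

    int main(void)
    {
        int i, len = 1024, errors = 0;
        double *c1 = malloc(len * sizeof(double));
        double *c2 = malloc(len * sizeof(double));

        run_reference(c1, len);
        run_accel(c2, len);

        for (i = len - 1; i >= 0; i--) {
            if (fabs(c1[i] - c2[i]) > EPSILON) {
                if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, c1[i], c2[i]);
            }
        }
        printf("Errors = %d\n", errors);
        free(c1); free(c2);
        return errors ? 1 : 0;
    }

The nonzero exit status on mismatch is what lets the regression script (later in this patch) stop on the first failing iteration.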
+ */ + if (lda < m) lda = m; + lda = (lda + 15) & ~(15); + + if (ldb < n) ldb = n; + ldb = (ldb + 15) & ~(15); + + if (ldc < m) ldc = m; + ldc = (ldc + 15) & ~(15); + + if (ldp) { + if (ldp < m) ldp = m; + ldp = (ldp + 15) & ~(15); + } + + c_cols = c_col + n; + c_rows = c_row + m; + + if (ldc < c_rows*64) ldc = c_rows*64; + ldc = ((ldc + 63) & ~63); + c_cols = (c_cols + 63) & ~63; + + csize = ldc*c_cols/64; + psize = ldp*n; + + printf("Performing dgemm test with m=%d n=%d lda=%d ldb=%d ldc=%d ldp=%d c_row=%d c_col=%d\n", m, n, lda, ldb, ldc, ldp, c_row, c_col); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_panel(k, ldb, 128); + if (ldp) { + C1 = C2 = (double *)allocate_matrix(c_cols, ldc, 128); + P1 = (double *)allocate_panel(n, ldp, 128); + P2 = (double *)allocate_panel(n, ldp, 128); + for (i=0; i=0; i--) { + double p1, p2; + + p1 = byte_swap(P1[i]); + p2 = byte_swap(P2[i]); + + if (fabs(p1 - p2) > EPSILON) { + if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, p1, p2); + } + } + } else { + for (i=csize, errors=0; i>=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dgemm_C_C_C.c =================================================================== RCS file: accel/lib/tests/dgemm_C_C_C.c diff -N accel/lib/tests/dgemm_C_C_C.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm_C_C_C.c 20 Aug 2008 03:57:53 -0000 1.4 @@ -0,0 +1,126 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + + +/* dgemm_C_C_C + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int k=64; + int n=64; + int m=128; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + + switch (argc) { + case 7: + ldc = atoi(argv[6]); + case 6: + ldb = atoi(argv[5]); + case 5: + lda = atoi(argv[4]); + case 4: + k = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 2000; + n = rand() % 70; + k = rand() % 70; + lda = rand() % 2000; + ldb = rand() % 200; + ldc = rand() % 2000; + + /* Force all parameter within constraints */ + if ((rand() & 1) == 0) { + k &= ~(4-1); + m &= ~(8-1); + n &= ~(4-1); + + if (k < 4) k = 4; + if (k > 64) k = 64; + if (m < 8) m = 8; + if (n < 4) n = 4; + if (n > 64) n = 64; + + lda &= ~1; + ldb &= ~1; + ldc &= ~1; + } + break; + default: + printf("Usage: %s [m [n [k [lda [ldb [ldc]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. 
+ */ + if (m == 0) m = 1; + if (n == 0) n = 1; + if (k == 0) k = 1; + + if (lda < m) lda = m; + if (ldb < k) ldb = k; + if (ldc < m) ldc = m; + + printf("Performing dgemm_C_C_C test with m=%d n=%d k=%d lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_panel(n, ldb, 128); + C1 = (double *)allocate_panel(n, ldc, 128); + C2 = (double *)allocate_panel(n, ldc, 128); + + for (i=0; i=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + errors++; + printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dtrsm.c =================================================================== RCS file: accel/lib/tests/dtrsm.c diff -N accel/lib/tests/dtrsm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dtrsm.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,147 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +#define EPSILON 0.0000001 + + +/* dtrsm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int n=128; + int n_padded, nb, m_padded; + int m=128; + unsigned int blk_col = 0; + unsigned int blk_row = 0; + volatile unsigned long long incomplete; + double *A, *B1, *B2, *C1, *C2; + + switch (argc) { + case 8: + blk_col = atoi(argv[7]); + case 7: + blk_row = atoi(argv[6]); + case 6: + ldc = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = ((rand() & 3) == 0) ? (rand() % 1024) : 128; + n = rand() % 1024; + lda = rand() % 1536; + ldb = rand() % 1536; + ldc = ((rand() & 3) == 0) ? (rand() % 1536) : 0; + if ((rand() & 7) == 0) { + blk_row = rand() & 127; + blk_col = rand() & 127; + } + break; + default: + printf("Usage: %s [m [n [lda [ldb [ldc [blk_row [blk_col]]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + if ((rand() & 7) != 0) lda &= ~1; + if (lda < m) lda = m; + + if ((rand() & 7) != 0) ldb &= ~1; + if (ldb < n) ldb = n; + + + if (ldc) { + ldc = (ldc + 1) & ~(1); + m_padded = blk_row + m; + if (ldc < 64*m_padded) ldc = 64*m_padded; + } else { + blk_row = blk_col = 0; + } + + n_padded = (n + blk_col + 63) & ~63; + + printf("Performing dtrsm test with m=%d n=%d lda=%d ldb=%d ldc=%d blk_row=%d blk_col=%d\n", m, n, lda, ldb, ldc, blk_row, blk_col); + + /* Allocate and initialize the arrays + */ + + hpl_ref_init(); + hpl_accel_init(); + + /* First test the DRTSM without copy into the C matrix. 
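The volatile unsigned long long incomplete declared by every test is the completion word for the asynchronous accelerator calls: the caller passes its address into the hpl_accel_* routine and spins with while (incomplete); until the library clears it (the pattern is visible intact in the perf_reform_rows test later in this patch). A sketch of the idiom, with accel_kernel_async as a hypothetical stand-in for the real entry points:

    #include <stdio.h>

    /* Hypothetical async kernel: a real implementation hands the work to
     * the SPEs and clears *done from another context when it finishes. */
    static void accel_kernel_async(unsigned long long *done)
    {
        /* ... enqueue work ... */
        *done = 0;  /* completes immediately in this sketch */
    }

    int main(void)
    {
        volatile unsigned long long incomplete = 1;

        accel_kernel_async((unsigned long long *)&incomplete);
        while (incomplete)
            ;  /* spin until the accelerator clears the completion word */

        printf("kernel complete\n");
        return 0;
    }

The volatile qualifier is what forces the spin loop to re-read memory on every iteration instead of caching the flag in a register.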
+ */ + A = (double *)allocate_panel(m, lda, 128); + B1 = (double *)allocate_panel(m, ldb, 128); + + for (i=0; i=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + errors++; + if (errors < 20) printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + printf("Errors (with copy) = %d\n", errors); + } else { + for (i=ldb*m-1, errors=0; i>=0; i--) { + if (fabs(B1[i] - B2[i]) > EPSILON) { + errors++; + if (errors < 20) printf(" %d expected=%f got=%f\n", i, B1[i], B2[i]); + } + } + printf("Errors (without copy) = %d\n", errors); + if (errors) return 1; + } + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dtrsm_CL_B.c =================================================================== RCS file: accel/lib/tests/dtrsm_CL_B.c diff -N accel/lib/tests/dtrsm_CL_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dtrsm_CL_B.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,121 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +#define EPSILON 0.0000001 + + +/* dtrsm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int n=128; + int n_padded, nb, m_padded; + int m=128; + unsigned int blk_col = 0; + unsigned int blk_row = 0; + volatile unsigned long long incomplete; + double *A, *B1, *B2; + + switch (argc) { + case 7: + blk_col = atoi(argv[6]); + case 6: + blk_row = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = ((rand() & 3) == 0) ? (rand() % 1024) : 128; + n = rand() % 1024; + lda = rand() % 1536; + ldb = rand() % 1536; + if ((rand() & 7) == 0) { + blk_row = rand() & 127; + blk_col = rand() & 127; + } + break; + default: + printf("Usage: %s [m [n [lda [ldb [blk_row [blk_col]]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + if ((rand() & 7) != 0) lda &= ~1; + if (lda < m) lda = m; + + if ((rand() & 7) != 0) ldb &= ~1; + if (ldb < n) ldb = n; + + + ldb = (ldb + 1) & ~(1); + m_padded = (m + blk_row + 63) & ~63; + + if (ldb < 64*m_padded) ldb = 64*m_padded; + + n_padded = (n + blk_col + 63) & ~63; + + printf("Performing dtrsm_CL_B test with m=%d n=%d lda=%d ldb=%d blk_row=%d blk_col=%d\n", m, n, lda, ldb, blk_row, blk_col); + + /* Allocate and initialize the arrays + */ + + hpl_ref_init(); + hpl_accel_init(); + + /* First test the DRTSM without copy into the C matrix. + */ + A = (double *)allocate_panel(m, lda, 128); + + for (i=0; i=0; i--) { + if (fabs(B1[i] - B2[i]) > EPSILON) { + errors++; + if (errors < 20) printf(" %d expected=%f got=%f\n", i, B1[i], B2[i]); + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 
1 : 0); +} Index: accel/lib/tests/perf_dgemm.c =================================================================== RCS file: accel/lib/tests/perf_dgemm.c diff -N accel/lib/tests/perf_dgemm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_dgemm.c 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,182 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" +#include +#include +#include +#include +#include + +/* dgemm performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=64; + int k=128; + int m_padded, n_padded; + int iterations = 1; + int ldp = 0; + volatile unsigned long long incomplete; + void *ptr; + char *env; + double *A, *B, *C, *P = NULL; + double tbfreq, gflops; + unsigned long long *ticks; + unsigned long long total; + double min, max, mean, std, delta; + + switch (argc) { + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + case 1: + break; + default: + printf("Usage: %s [n]\n", argv[0]); + return 1; + break; + } + + m_padded = (m + 63) & ~63; + n_padded = (n + 63) & ~63; + + if ((env = getenv("ITERATIONS"))) + iterations = atoi(env); + ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long)); + + /* Allocate and initialize the arrays + */ + if (getenv("HUGE_TLBFS")) { + size_t memsize = 4*128 + (128*(m_padded+n_padded) + ((size_t)m_padded*(size_t)n_padded)) * sizeof(double); + size_t hugepagesize = 16*1024*1024; + int fd; + void *mem = NULL; + char filename[100]; + + if (getenv("PANEL")) { + memsize += 128 + ((size_t)m_padded * (size_t)n_padded)*sizeof(double); + } + + sprintf(filename, "/huge/perf_dgemm_%d.dat", getpid()); + + if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) { + printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno)); + exit(1); + } else { + /* Delete file so that huge pages will get freed on program termination. 
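The huge-page path in perf_dgemm.c above is worth spelling out: the test creates a file in a hugetlbfs mount (assumed at /huge), immediately unlinks it so the pages are reclaimed when the process exits, mmaps it at a fixed hint address above 4GB, and only then closes the descriptor, since closing does not unmap. A trimmed, self-contained sketch of the same sequence:

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t hugepagesize = 16 * 1024 * 1024;  /* 16MB pages on this platform */
        size_t memsize = 1;                      /* requested bytes, rounded below */
        char filename[100];
        int fd;
        void *mem;

        sprintf(filename, "/huge/demo_%d.dat", getpid());
        if ((fd = open(filename, O_CREAT | O_RDWR, 0755)) == -1) {
            printf("open %s failed (errno=%d): %s\n", filename, errno, strerror(errno));
            exit(1);
        }
        remove(filename);  /* unlink now so the pages free themselves at exit */

        memsize = (memsize + hugepagesize - 1) & ~(hugepagesize - 1);
        mem = mmap((void *)0x100000000ULL, memsize, PROT_READ | PROT_WRITE,
                   MAP_SHARED, fd, 0);
        if (mem == MAP_FAILED) {
            printf("mmap failed (errno=%d): %s\n", errno, strerror(errno));
            exit(1);
        }
        close(fd);  /* the mapping survives the close */

        ((char *)mem)[0] = 1;  /* touch the mapping */
        return 0;
    }

Mapping above 0x100000000 is deliberate: it keeps the buffers in the address range where 4GB-boundary behavior can be exercised, matching the allocators in test_utils.h.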
*/ + remove(filename); + + memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1); + + mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n", + (unsigned long long)memsize, filename, errno, strerror(errno)); + exit(1); + } + + A = (double *)ALIGN128(mem); + B = (double *)ALIGN128(A + 128*m_padded); + C = (double *)ALIGN128(B + 128*n_padded); + if (getenv("PANEL")) { + ldp = m_padded; + P = (double *)ALIGN128(C + m_padded*n_padded); + } + + /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */ + close (fd); + } + } else { + if (posix_memalign(&ptr, 128, 128*m_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + if (posix_memalign(&ptr, 128, 128*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + B = (double *)ptr; + } + if (posix_memalign(&ptr, 128, m_padded*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + C = (double *)ptr; + } + if (getenv("PANEL")) { + ldp = m_padded; + if (posix_memalign(&ptr, 128, m_padded*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + P = (double *)ptr; + } + } + } + + for (i=0; i<128*m_padded; i++) { + A[i] = 0.0f; + __dcbf(&A[i]); + } + for (i=0; i<128*n_padded; i++) { + B[i] = 0.0f; + __dcbf(&B[i]); + } + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + + printf("DGEMM m=%d n=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% %f Gflops/sec\n", m, n, + min, max, mean, std, 100.0*((double)(max-min))/((double)mean), gflops); + + + return 0; +} Index: accel/lib/tests/perf_dgemm_C.c =================================================================== RCS file: accel/lib/tests/perf_dgemm_C.c diff -N accel/lib/tests/perf_dgemm_C.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_dgemm_C.c 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,161 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" +#include +#include +#include +#include +#include + +/* dgemm_C_C_C performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=64; + int k=64; + int m_padded, k_padded; + int iterations = 1; + volatile unsigned long long incomplete; + void *ptr; + char *env; + double *A, *B, *C; + double tbfreq, gflops; + unsigned long long *ticks; + unsigned long long total; + double min, max, mean, std, delta; + + switch (argc) { + case 4: + k = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + case 1: + break; + default: + printf("Usage: %s [m [n [k]]]\n", argv[0]); + return 1; + break; + } + + k_padded = (k + 15) & ~15; + m_padded = (m + 15) & ~15; + + if ((env = getenv("ITERATIONS"))) + iterations = atoi(env); + ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long)); + + /* Allocate and initialize the arrays + */ + if (getenv("HUGE_TLBFS")) { + size_t memsize = (m_padded*(k+n) + k_padded*n) * sizeof(double); + size_t hugepagesize = 16*1024*1024; + int fd; + void *mem = NULL; + char filename[100]; + + sprintf(filename, "/huge/perf_dgemm_C_%d.dat", getpid()); + + if ((fd = open (filename, 
O_CREAT | O_RDWR, 0755)) == -1) { + printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno)); + exit(1); + } else { + /* Delete file so that huge pages will get freed on program termination. */ + remove(filename); + + memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1); + + mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n", + (unsigned long long)memsize, filename, errno, strerror(errno)); + exit(1); + } + + A = (double *)ALIGN128(mem); + B = (double *)ALIGN128(A + m_padded*k); + C = (double *)ALIGN128(B + k_padded*n); + + /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */ + close (fd); + } + } else { + if (posix_memalign(&ptr, 128, m_padded*k*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + if (posix_memalign(&ptr, 128, k_padded*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + B = (double *)ptr; + } + if (posix_memalign(&ptr, 128, m_padded*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + C = (double *)ptr; + } + } + + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + + printf("DGEMM_CL m=%d n=%d k=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% %f Gflops/sec\n", m, n, k, + min, max, mean, std, 100.0*((double)(max-min))/((double)mean), gflops); + + + return 0; +} Index: accel/lib/tests/perf_dgemm_CL.c =================================================================== RCS file: accel/lib/tests/perf_dgemm_CL.c diff -N accel/lib/tests/perf_dgemm_CL.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_dgemm_CL.c 14 May 2008 21:35:01 -0000 1.2 @@ -0,0 +1,163 @@ +/* ------------------------------------------------------------------ */ +/* (C) Copyright 2007 */ +/* International Business Machines Corporation, */ +/* */ +/* All Rights Reserved. 
*/ +/* ------------------------------------------------------------------ */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" +#include +#include +#include +#include +#include + +/* dgemm performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=64; + int k=64; + int m_padded, k_padded; + int iterations = 1; + volatile unsigned long long incomplete; + void *ptr; + char *env; + double *A, *B, *C; + double tbfreq, gflops; + unsigned long long *ticks; + unsigned long long total; + double min, max, mean, std, delta; + + switch (argc) { + case 4: + k = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + case 1: + break; + default: + printf("Usage: %s [m [n [k]]]\n", argv[0]); + return 1; + break; + } + + k_padded = (k + 15) & ~15; + m_padded = (m + 15) & ~15; + + if ((env = getenv("ITERATIONS"))) + iterations = atoi(env); + ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long)); + + /* Allocate and initialize the arrays + */ + if (getenv("HUGE_TLBFS")) { + size_t memsize = (m_padded*(k+n) + k_padded*n) * sizeof(double); + size_t hugepagesize = 16*1024*1024; + int fd; + void *mem = NULL; + char filename[100]; + + sprintf(filename, "/huge/perf_dgemm_CL_%d.dat", getpid()); + + if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) { + printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno)); + exit(1); + } else { + /* Delete file so that huge pages will get freed on program termination. */ + remove(filename); + + memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1); + + mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n", + (unsigned long long)memsize, filename, errno, strerror(errno)); + exit(1); + } + + A = (double *)ALIGN128(mem); + B = (double *)ALIGN128(A + m_padded*k); + C = (double *)ALIGN128(B + k_padded*n); + + /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */ + close (fd); + } + } else { + if (posix_memalign(&ptr, 128, m_padded*k*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + if (posix_memalign(&ptr, 128, k_padded*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + B = (double *)ptr; + } + if (posix_memalign(&ptr, 128, m_padded*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + C = (double *)ptr; + } + } + + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + + printf("DGEMM_CL m=%d n=%d k=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% %f Gflops/sec\n", m, n, k, + min, max, mean, std, 100.0*((double)(max-min))/((double)mean), gflops); + + + return 0; +} Index: accel/lib/tests/perf_dtrsm.c =================================================================== RCS file: accel/lib/tests/perf_dtrsm.c diff -N accel/lib/tests/perf_dtrsm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_dtrsm.c 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,159 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" +#include +#include +#include +#include +#include + + 
+/* dtrsm performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=128; + int n=128; + int n_padded; + volatile unsigned long long incomplete; + void *ptr; + double *A, *B, *C; + double ops, bytes, tbfreq, gflops, grate; + int iterations = 1; + char *env; + unsigned long long *ticks; + unsigned long long total; + double min, max, mean, std, delta; + + switch (argc) { + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + case 1: + break; + default: + printf("Usage: %s [m [n]]\n", argv[0]); + return 1; + break; + } + + if ((env = getenv("ITERATIONS"))) + iterations = atoi(env); + ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long)); + + /* Allocate and initialize the arrays + */ + n_padded = (n | 128) & ~(128-1); + if (getenv("HUGE_TLBFS")) { + size_t memsize = (m*m + m*n_padded + m*n)*sizeof(double) + 3*128; + size_t hugepagesize = 16*1024*1024; + int fd; + void *mem = NULL; + char filename[100]; + + sprintf(filename, "/huge/perf_dtrsm_%d.dat", getpid()); + + if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) { + printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno)); + exit(1); + } else { + /* Delete file so that huge pages will get freed on program termination. */ + remove(filename); + + memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1); + + mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n", + (unsigned long long)memsize, filename, errno, strerror(errno)); + exit(1); + } + + A = (double *)ALIGN128(mem); + B = (double *)ALIGN128(A + m*m); + C = (double *)ALIGN128(B + m*n_padded); + + /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */ + close (fd); + } + } else { + if (posix_memalign(&ptr, 128, m*m*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + + /* Pad n to an off multiple of 128 for bank utilization performance reasons. 
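The padding the comment above describes ("off" reads as a typo for "odd") is computed earlier as n_padded = (n | 128) & ~(128-1): OR-ing in bit 7 before clearing the low seven bits forces the result to an odd multiple of 128, which staggers consecutive columns across memory banks instead of landing them all in the same bank. A quick check of the expression:

    #include <assert.h>

    /* Force x to an odd multiple of 128: set bit 7, clear bits 0..6. */
    static int odd_mult_128(int x) { return (x | 128) & ~(128 - 1); }

    int main(void)
    {
        assert(odd_mult_128(128)  == 128);  /* 1*128: odd multiplier kept   */
        assert(odd_mult_128(256)  == 384);  /* 2*128 is bumped to 3*128     */
        assert(odd_mult_128(300)  == 384);  /* low bits cleared, bit 7 set  */
        assert((odd_mult_128(1000) / 128) % 2 == 1);
        return 0;
    }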
+ */ + if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + B = (double *)ptr; + } + if (posix_memalign(&ptr, 128, m*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + C = (double *)ptr; + } + } + + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + printf("m=%d n=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% %f Gflops/sec %f Gbytes/sec\n", + m, n, min, max, mean, std, + 100.0*((double)(max-min))/((double)mean), + gflops, grate); + + return 0; +} Index: accel/lib/tests/perf_reform_lpanel.c =================================================================== RCS file: accel/lib/tests/perf_reform_lpanel.c diff -N accel/lib/tests/perf_reform_lpanel.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_reform_lpanel.c 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,86 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +/* reform l panel performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=128; + int n_padded; + volatile unsigned long long incomplete; + void *ptr; + double *A, *panel; + double tbfreq; + unsigned long long ticks; + + + switch (argc) { + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + break; + default: + printf("Usage: %s [m [n]]\n", argv[0]); + return 1; + break; + } + + m = m & ~(64-1); + if (m < 64) m = 64; + + if (n < 1) n = 1; + n_padded = (n + 63) & ~(63); + + /* Allocate and initialize the arrays + */ + if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + + if (posix_memalign(&ptr, 128, m*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + panel = (double *)ptr; + } + + for (i=0; i +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +/* reform matrix performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=128; + int mb, nb; + int spes=1; + int size; + volatile unsigned long long incomplete; + void *ptr; + double *A, *scratch; + double tbfreq, blocks_xfer; + unsigned long long ticks; + + + switch (argc) { + case 4: + spes = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + break; + default: + printf("Usage: %s [m [n [spes]]]\n", argv[0]); + return 1; + break; + } + + m = m & ~(64-1); + if (m < 64) m = 64; + + n = n & ~(64-1); + if (n < 64) n = 64; + + size = spes*64*(m-4); + + /* Allocate and initialize the arrays + */ + if (posix_memalign(&ptr, 128, m*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + + if (posix_memalign(&ptr, 128, size*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + scratch = (double *)ptr; + } + + for (i=0; i +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" +#include +#include +#include +#include +#include + + +/* reform rows performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=1; + int n=128; + int height=256; + int blk_col=0; + int n_padded; + int iterations=1; + int *rows; + volatile unsigned long long incomplete; + void *ptr; + double *A, *R; + double 
tbfreq; + char *env; + unsigned long long *ticks; + unsigned long long total; + double min, max, mean, std, delta; + + switch (argc) { + case 5: + blk_col = atoi(argv[4]); + case 4: + height = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + break; + default: + printf("Usage: %s [m [n [lda [blk_col]]]\n", argv[0]); + return 1; + break; + } + + blk_col &= (64-1); + + if (n < 1) n = 1; + n_padded = (n + blk_col + 63) & ~(63); + + if (height < m) height = m; + height = (height + 63) & ~63; + + if ((env = getenv("ITERATIONS"))) + iterations = atoi(env); + ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long)); + + if (getenv("HUGE_TLBFS")) { + size_t memsize = 2*128 + m*sizeof(int) + (size_t)(height+m)*n_padded*sizeof(double); + size_t hugepagesize = 16*1024*1024; + int fd; + void *mem = NULL; + char filename[100]; + + sprintf(filename, "/huge/perf_reform_rows_%d.dat", getpid()); + + if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) { + printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno)); + exit(1); + } else { + /* Delete file so that huge pages will get freed on program termination. */ + remove(filename); + + memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1); + + mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n", + (unsigned long long)memsize, filename, errno, strerror(errno)); + exit(1); + } + + A = (double *)ALIGN128(mem); + R = (double *)ALIGN128(A + height*n_padded); + rows = (int *)(R + m*n_padded); + + /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */ + close (fd); + } + } else { + /* Allocate and initialize the arrays + */ + if (posix_memalign(&ptr, 128, height*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + + if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + R = (double *)ptr; + } + rows = (int *)malloc(m*sizeof(int)); + } + + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + printf("REFORM ROW (R_to_B) m=%d n=%d height=%d blk_col=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% rate=%f Gbytes/sec xfer=%f Gbytes/sec\n", m, n, height, blk_col, + min, max, mean, std, 100.0*((double)(max-min))/((double)mean), + (double)iterations * (double)m * (double)n * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9), + (double)iterations * (double)m * (double)(2*n) * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9)); + + /* Test BLOCK to ROW copy */ + + /* Perform 1 iteration first to pre-charge the PTEs + */ + hpl_accel_reform_rows_B_to_R(m, n, R, n_padded, A, M_SUB*height, rows, blk_col, (unsigned long long *)&incomplete); + while (incomplete); + + total = 0; + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + printf("REFORM ROW (B_to_R) m=%d n=%d height=%d blk_col=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% rate=%f Gbytes/sec xfer=%f Gbytes/sec\n", m, n, height, blk_col, + min, max, mean, std, 100.0*((double)(max-min))/((double)mean), + (double)iterations * (double)m * (double)n * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9), + (double)iterations * (double)m * (double)(2*n) * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9)); 
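All the perf_* tests reduce their per-iteration timebase samples the same way: accumulate total ticks, derive min/max/mean and a standard deviation, then scale by the timebase frequency from get_timebase_frequency() to report a rate. A condensed sketch of that reduction (the tick samples, tbfreq, and bytes-per-iteration values are assumed inputs):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* Assumed samples; the real tests read the timebase around each call. */
        unsigned long long ticks[4] = { 1000, 1010, 990, 1000 };
        int i, iterations = 4;
        unsigned long long total = 0;
        double tbfreq = 26.66e6;          /* assumed timebase frequency, Hz */
        double bytes = 1024.0 * 1024.0;   /* assumed bytes moved per iteration */
        double min = 1e300, max = 0.0, mean, std = 0.0, delta;

        for (i = 0; i < iterations; i++) total += ticks[i];
        mean = (double)total / (double)iterations;

        for (i = 0; i < iterations; i++) {
            delta = (double)ticks[i] - mean;
            std += delta * delta;
            if ((double)ticks[i] < min) min = (double)ticks[i];
            if ((double)ticks[i] > max) max = (double)ticks[i];
        }
        std = sqrt(std / (double)iterations);

        printf("MIN=%f MAX=%f MEAN=%f StdDev=%f Variance=%f%% rate=%f Gbytes/sec\n",
               min, max, mean, std, 100.0 * (max - min) / mean,
               (double)iterations * bytes * tbfreq / ((double)total * 1.0e9));
        return 0;
    }

The rate term works because total/tbfreq is elapsed seconds, so iterations*bytes divided by it, scaled by 1e9, yields Gbytes/sec; the Gflops variants substitute an operation count for bytes.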
+ + + return 0; +} Index: accel/lib/tests/perf_reform_upanel.c =================================================================== RCS file: accel/lib/tests/perf_reform_upanel.c diff -N accel/lib/tests/perf_reform_upanel.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_reform_upanel.c 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,80 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +/* reform u panel performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=128; + int n_padded; + volatile unsigned long long incomplete; + void *ptr; + double *A, *panel; + double tbfreq; + unsigned long long ticks; + + + switch (argc) { + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + break; + default: + printf("Usage: %s [m [n]]\n", argv[0]); + return 1; + break; + } + + n_padded = (n + 15) & ~(15); + + /* Allocate and initialize the arrays + */ + if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + + if (posix_memalign(&ptr, 128, m*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + panel = (double *)ptr; + } + + for (i=0; i +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +#define EPSILON 0.0000001 + + +/* reform + */ +int main(int argc, char *argv[]) +{ + int i, j; + int col; + int errors; + int lda = 0; + int ldp = 0; + int n=128; + int m=128; + volatile unsigned long long incomplete; + double *A1, *A2, *panel, *scratch; + + switch (argc) { + case 5: + ldp = atoi(argv[3]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 1280; + n = rand() % 1280; + lda = rand() % 2048; + ldp = rand() % 2048; + break; + default: + printf("Usage: %s [m [n [lda [ldp]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + m = m & ~(128-1); + if (m < 128) m = 128; + + n = n & ~(128-1); + if (n < 128) n = 128; + + lda = lda & ~(15); + if (lda < m) lda = m; + + ldp = ldp & ~(15); + if (ldp < m) ldp = m; + + printf("Performing reform test with m=%d n=%d lda=%d ldp=%d\n", m, n, lda, ldp); + + /* Allocate and initialize the arrays + */ + A1 = (double *)allocate_matrix(n/64, lda*M_SUB, 128); + A2 = (double *)allocate_matrix(n/64, lda*M_SUB, 128); + scratch = (double *)allocate_panel(1, 128*ldp, 128); /* allocate 1 row so that no 4GB crossings occur */ + panel = (double *)allocate_panel(128, ldp, 128); + + for (i=0; i +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +/* reformat L panel + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int ldp = 0; + int lda = 0; + int n=64; + int m=64; + int n_padded; + volatile unsigned long long incomplete; + double *A, *P1, *P2; + + switch (argc) { + case 5: + ldp = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. 
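The no-argument mode introduced by the comment above is how every test fuzzes itself: seed rand() from the PowerPC timebase register via __mftb() so back-to-back runs differ, draw random sizes, then mask them into the kernel's legal ranges. A sketch, with read_timebase() as a portable stand-in for the __mftb() intrinsic:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    /* Stand-in for the __mftb() timebase intrinsic used on PowerPC. */
    static unsigned long long read_timebase(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
    }

    int main(void)
    {
        int m, n, lda;

        srand((unsigned int)read_timebase());
        m = rand() % 1280;
        n = rand() % 1280;
        lda = rand() % 2048;

        /* Mask into the kernel's legal ranges, as the tests do. */
        m &= ~(64 - 1);  if (m < 64) m = 64;
        n &= ~(64 - 1);  if (n < 64) n = 64;
        lda &= ~15;      if (lda < m) lda = m;

        printf("fuzzed: m=%d n=%d lda=%d\n", m, n, lda);
        return 0;
    }

Because the constraints are applied after the random draw, the fuzzer always produces a valid parameter set while still exercising odd strides and near-minimum sizes.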
+ */ + srand((unsigned int)__mftb()); + m = rand() % 768; + n = rand() % 768; + lda = M_SUB * (rand() % 1536); + ldp = rand() % 1536; + break; + default: + printf("Usage: %s [m [n [lda [ldp]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + m = m & ~(64-1); + + ldp = (ldp + 15) & ~(15); + if (ldp < m) ldp = m; + if (ldp < 16) ldp = 16; + if (lda < m*M_SUB) lda = m*M_SUB; + + printf("Performing reform_lpanel test with m=%d n=%d lda=%d ldp=%d\n", m, n, lda, ldp); + + /* Allocate and initialize the arrays + */ + n_padded = (n + M_SUB-1) & ~(M_SUB-1); + + A = (double *)allocate_matrix(n_padded/M_SUB, lda, 128); + P1 = (double *)allocate_panel(n, ldp, 128); + P2 = (double *)allocate_panel(n, ldp, 128); + + if ((A == NULL) || (P1 == NULL) || (P2 == NULL)) { + printf("Failed to allocate buffers. Total allocation is %f MB. %p %p %p\n", (2.0*ldp*n + (double)lda*n_padded)*sizeof(double)/(1024.0*1024.0), A, P1, P2); + return 0; + } + + for (i=0; i=0; i--) { + if (P1[i] != P2[i]) { + errors++; + if (errors < 20) printf(" %d expected=%f got=%f\n", i, P1[i], P2[i]); + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/reform_matrix.c =================================================================== RCS file: accel/lib/tests/reform_matrix.c diff -N accel/lib/tests/reform_matrix.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/reform_matrix.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,97 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +/* reformat matrix + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int n=64; + int m=64; + int m_pad; + int size; + volatile unsigned long long incomplete; + double *A1, *A2, *scratch; + + switch (argc) { + case 5: + size = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 1280; + n = rand() % 1280; + lda = rand() % 2048; + size = rand() % (8*m*64); + break; + default: + printf("Usage: %s [m [n [lda [size]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + n = n & ~(64-1); + if (n < 64) n = 64; + + if (m < 1) m = 1; + m_pad = (m + 63) & ~63; + + lda = lda & ~(64-1); + if (lda < m_pad) lda = m_pad; + + if (size < m_pad*64) size = m_pad*64; + size = (size + (128-1)) & ~(128-1); /* Pad the scratch buffer to a cacheline */ + + printf("Performing reform_matrix test with m=%d n=%d lda=%d size=%d\n", m, n, lda, size); + + /* Allocate and initialize the arrays + */ + A1 = (double *)allocate_matrix(n/M_SUB, lda*M_SUB, 128); + A2 = (double *)allocate_matrix(n/M_SUB, lda*M_SUB, 128); + scratch = (double *)allocate_panel(1, size, 128); + + for (i=0; i=0; i--) { + if (A1[i] != A2[i]) { + errors++; + if (errors < 20) printf(" %d expected=%f got=%f\n", i, A1[i], A2[i]); + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 
1 : 0); +} Index: accel/lib/tests/reform_rows.c =================================================================== RCS file: accel/lib/tests/reform_rows.c diff -N accel/lib/tests/reform_rows.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/reform_rows.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,158 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +int rand_row(int max, int *rows, int cnt) +{ + int i; + int new_row; + int unique; + + do { + new_row = (max * (rand() & 0xFFFF)) >> 16; + unique = 1; + + for (i=0; i=0; i--) { + if (A1[i] != A2[i]) { + errors++; + if (errors < 20) printf("R->B %d expected=%f got=%f\n", i, A1[i], A2[i]); + } + } + + /* Test BLOCK to ROW copy */ + for (i=0; i=0; i--) { + if (R1[i] != R2[i]) { + errors++; + if (errors < 20) printf("B->R %d expected=%f got=%f\n", i, R1[i], R2[i]); + } + } + + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/reform_upanel.c =================================================================== RCS file: accel/lib/tests/reform_upanel.c diff -N accel/lib/tests/reform_upanel.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/reform_upanel.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,101 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + + +/* reformat L panel + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int ldp = 0; + int lda = 0; + int n=64; + int m=64; + int m_padded, n_padded; + volatile unsigned long long incomplete; + double *A1, *A2, *P; + + switch (argc) { + case 5: + ldp = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 700; + n = rand() % 700; + lda = M_SUB * (rand() % 1280); + ldp = rand() % 1280; + break; + default: + printf("Usage: %s [m [n [lda [ldp]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + m_padded = (m + M_SUB-1) & ~(M_SUB-1); + if (ldp < n) ldp = n; + ldp = (ldp + 15) & ~(15); + if (lda < m_padded*M_SUB) lda = m_padded*M_SUB; + lda = (lda + 15) & ~(15); + + printf("Performing reform_upanel test with m=%d n=%d lda=%d ldp=%d\n", m, n, lda, ldp); + + /* Allocate and initialize the arrays + */ + n_padded = (n + M_SUB-1) & ~(M_SUB-1); + + A1 = (double *)allocate_matrix(n_padded/M_SUB, lda, 128); + A2 = (double *)allocate_matrix(n_padded/M_SUB, lda, 128); + P = (double *)allocate_panel(m, ldp, 128); + + if ((A1 == NULL) || (A2 == NULL) || (P == NULL)) { + printf("Failed to allocate buffers. Total allocation is %f MB. %p %p %p\n", (2.0*lda*n_padded + (double)ldp*m)*sizeof(double)/(1024.0*1024.0), A1, A2, P); + return 0; + } + + for (i=0; i=0; i--) { + if (A1[i] != A2[i]) { + errors++; + if (errors < 20) printf(" %d expected=%f got=%f\n", i, A1[i], A2[i]); + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 
1 : 0); +} Index: accel/lib/tests/regression =================================================================== RCS file: accel/lib/tests/regression diff -N accel/lib/tests/regression --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/regression 15 May 2008 01:36:22 -0000 1.4 @@ -0,0 +1,38 @@ +#!/bin/sh +# hpl_accel regression test suite + +let ITERATIONS=20 + +regress() { + let i=0 + + echo ">>> Regression testing" $1 "for" $ITERATIONS "iterations <<<" + + while ((i +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +int rand_row(int max) +{ + int new_row; + new_row = (max * (rand() & 0xFFFF)) >> 16; + return (new_row); +} + + +/* swap row + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int n=64; + int m=1; + int blk_col=0; + int n_padded, m_padded; + int *rows; + volatile unsigned long long incomplete; + double *A1, *A2; + + switch (argc) { + case 5: + blk_col = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 64; + n = rand() % 700; + lda = M_SUB * (rand() % 800); + blk_col = rand() % 200; + break; + default: + printf("Usage: %s [m [n [lda [blk_col]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + if (m < 1) m = 1; + if (blk_col < 0) blk_col = 0; + lda = (lda + 15) & ~(15); + + printf("Performing swap_rows test with m=%d n=%d lda=%d blk_col=%d\n", m, n, lda, blk_col); + + /* Allocate and initialize the arrays + */ + m_padded = (m + M_SUB-1) & (~(M_SUB-1)); + n_padded = (n+blk_col + M_SUB-1) & ~(M_SUB-1); + if (lda < m_padded*M_SUB) lda = m_padded * M_SUB; + + A1 = (double *)allocate_matrix(m_padded, lda, 128); + A2 = (double *)allocate_matrix(m_padded, lda, 128); + + rows = (int *)allocate_panel(1, m * sizeof(int), 4); /* Never cross a 4GB boundary */ + + if ((A1 == NULL) || (A2 == NULL) || (rows == NULL)) { + printf("Failed to allocate buffers. Total allocation is %f MB. %p %p\n", (2.0*m*n_padded)/(1024.0*1024.0), A1, A2); + return 0; + } + + /* Test BLOCK to BLOCK copy */ + for (i=0; i=0; i--) { + if (A1[i] != A2[i]) { + errors++; + if (errors < 20) printf("B<->B %d expected=%f got=%f\n", i, A1[i], A2[i]); + } + } + + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/test_utils.h =================================================================== RCS file: accel/lib/tests/test_utils.h diff -N accel/lib/tests/test_utils.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/test_utils.h 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,147 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _TEST_UTILS_H_ +#define _TEST_UTILS_H_ + +#include +#include +#include +#include +#include +#include "hpl_accel.h" + +extern ssize_t getline(char **lineptr, size_t *n, FILE *stream); + +#define ALIGN128(_x) (((uintptr_t)(_x) + 127) & ~(127)) + +/* get_timebase_frequency + * ---------------------- + * Parse /proc/cpuinfo for the timebase frequency. + * This information is returned in Hz. If the data + * can not be obtained, then 0.0 is returned. 
+ */ +#define CPU_INFO "/proc/cpuinfo" + +double get_timebase_frequency() +{ + FILE *fp; + double freq = 0.0; + char *line = NULL; + size_t len = 0; + ssize_t chrs_read; + + if ((fp = fopen(CPU_INFO, "r"))) { + while ((chrs_read = getline(&line, &len, fp)) != -1) { + if (sscanf(line, "timebase : %lf", &freq) == 1) { + if (strstr(line, "KHz")) { + freq *= 1.0e3; + } else if (strstr(line, "MHz")) { + freq *= 1.0e6; + } else if (strstr(line, "GHz")) { + freq *= 1.0e9; + } + break; + } + } + if (line) free(line); + fclose(fp); + } + return (freq); +} + +double byte_swap(double x) +{ +#ifdef ACCEL_LITTLE_ENDIAN + int i; + union { + double d; + unsigned char c[8]; + } in, out; + + in.d = x; + for (i=0; i<8; i++) out.c[i] = in.c[7-i]; + + return (out.d); +#else + return (x); +#endif +} + + +unsigned long long segment = 0x100000000ULL; + +void *allocate_panel(int rows, /* # of rows (row ordered) or colums (column ordered) */ + size_t row_size, /* # of doubles per row (row ordered) or column (column ordered) */ + int alignment) /* alignment of allocation */ +{ + void *ptr; + unsigned long long start, skip; + unsigned int row_crossing; + size_t size, padded_size; + int page_size; + + row_size *= sizeof(double); + size = rows * row_size; + page_size = getpagesize(); + padded_size = (size + (alignment-1) + (row_size - 1) + (page_size - 1)) & ~(page_size -1); + +#ifdef PANEL_4GB_CROSSING + row_crossing = ((rand() & 0xFFFF) * rows) >> 16; +#else + row_crossing = rows; +#endif + + do { + segment += 0x100000000ULL; + start = segment - ((unsigned long long)row_crossing * (unsigned long long)row_size); + skip = start & ((unsigned long long)page_size - 1); + ptr = mmap((void *)(start-skip), padded_size+skip, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + } while (ptr == MAP_FAILED); + + ptr += skip; + + return (ptr); +} + + +void *allocate_matrix(int blk_columns, /* number of block columns */ + size_t blk_column_size, /* # of doubles to stride a column of blocks */ + int alignment) /* alignment of allocation */ +{ + void *ptr; + unsigned long long start, skip; + unsigned int row_crossing, col_crossing, blks_per_col; + size_t size, padded_size; + int page_size; + + blk_column_size *= sizeof(double); + size = (size_t)blk_columns * blk_column_size; + page_size = getpagesize(); + padded_size = (size + (alignment-1) + (M_SUB * M_SUB * sizeof(double) - 1) + (page_size - 1)) & ~(page_size -1); + + blks_per_col = blk_column_size / (M_SUB * M_SUB * sizeof(double)); + +#ifdef MATRIX_4GB_CROSSING + row_crossing = ((rand() & 0xFFFF) * blks_per_col) >> 16; + col_crossing = ((rand() & 0xFFFF) * blk_columns) >> 16; +#else + row_crossing = blks_per_col; + col_crossing = blk_columns; +#endif + + do { + segment += 0x100000000ULL; + + start = segment - ((unsigned long long)(col_crossing) * (unsigned long long)(blk_column_size) + (unsigned long long)(row_crossing * M_SUB * M_SUB * sizeof(double))); + skip = start & ((unsigned long long)page_size - 1); + ptr = mmap((void *)(start-skip), padded_size+skip, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + } while (ptr == MAP_FAILED); + + ptr += skip; + return (ptr); +} + +#endif /* _TEST_UTILS_H_ */ Index: include/hpl.h =================================================================== RCS file: /cvsroot/hpl_qs22/include/hpl.h,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- include/hpl.h 10 Feb 2008 21:45:50 -0000 1.1 +++ include/hpl.h 26 Aug 2008 13:24:26 -0000 1.3 @@ -43,6 +43,9 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) 
ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ #ifndef HPL_H #define HPL_H /* @@ -82,6 +85,8 @@ #include "hpl_panel.h" #include "hpl_pfact.h" #include "hpl_pgesv.h" + +#include "hpl_accel.h" #include "hpl_timer.h" #include "hpl_matgen.h" Index: include/hpl_accel.h =================================================================== RCS file: include/hpl_accel.h diff -N include/hpl_accel.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ include/hpl_accel.h 20 Aug 2008 03:57:53 -0000 1.13 @@ -0,0 +1,61 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef HPL_ACCEL_H +#define HPL_ACCEL_H + +#ifdef HPL_CALL_ACCEL + +/* ---------------------------------------------------------------- */ +/* Include files */ +/* ---------------------------------------------------------------- */ + +#include "hpl_grid.h" /* HPL_T_grid */ +#include "hpl_pgesv.h" /* HPL_T_pmat */ +#include "hpl_panel.h" /* HPL_T_panel */ + +#include "../accel/lib/hpl_accel.h" + +/* ---------------------------------------------------------------- */ +/* Macros */ +/* ---------------------------------------------------------------- */ + +#define CROSSES_4GB_BOUNDARY(p,len) \ + ( (((size_t)(p))>>32) != (((size_t)(p)+(len)-1)>>32) ) + +#define FIX_4GB_BOUNDARY_CROSSING(p,len) \ + ( CROSSES_4GB_BOUNDARY(p,len) ? ((((size_t)(p)+(len)-1)>>32)<<32) : (size_t)(p) ) + +/* ---------------------------------------------------------------- */ +/* Function prototypes */ +/* ---------------------------------------------------------------- */ + +extern int HPL_accel_init(int my_rank); +extern int HPL_accel_exit(int my_rank); +extern int HPL_accel_pgesv_init( HPL_T_grid *GRID, HPL_T_palg *ALGO, HPL_T_pmat *A); +extern int HPL_accel_pgesv_fini( HPL_T_grid *GRID, HPL_T_palg *ALGO, HPL_T_pmat *A); +extern int HPL_accel_pangetL( HPL_T_panel *PANEL); +extern int HPL_accel_panputU( HPL_T_panel *PANEL, double *data, int ld, int *rows, int nn); +extern int HPL_accel_rowget( HPL_T_panel *PANEL, double *data, int ld, int numrows, int *rows, int jj, int nn); +extern int HPL_accel_rowput( HPL_T_panel *PANEL, double *data, int ld, int numrows, int *rows, int jj, int nn); +extern int HPL_accel_dtrsm( HPL_T_panel *PANEL, int j1, int nn); +extern int HPL_accel_dgemm( HPL_T_panel *PANEL, int j1, int nn); +extern int HPL_accel_dgemm_async( HPL_T_panel *PANEL, int j1, int nn); +extern int HPL_accel_dgemm_wait( HPL_T_panel *PANEL); +extern void HPL_accel_dgemmCL(int m, int n, int k, const double *a, int lda, const double *b, int ldb, double *c, int ldc); +extern int HPL_accel_swap00N( HPL_T_panel *PANEL, const int *IPIV, int j1, int nn); +extern int HPL_accel_swap01T( HPL_T_panel *PANEL, const int *LINDXA, const int *LINDXAU, const int numrows, const int nn); +extern int HPL_accel_swap02N( HPL_T_panel *PANEL, const int *LINDXA, const int *LINDXAU, const int numrows, double *W0, double *W, const int ldw, const int nn); +extern int HPL_accel_swap04T( HPL_T_panel *PANEL, const int *LINDXA, const int *LINDXAU, const int numrows, const int numrows2, double *W0, double *W, const int ldw, const int nn); +extern int HPL_accel_swap05T( HPL_T_panel *PANEL, const int 
*LINDXA, const int *LINDXAU, const int numrows, const int nn); +extern int HPL_accel_swap06T( HPL_T_panel *PANEL, const int *LINDXA, const int numrows, const int i0, const int nn); + +#else + +#define FIX_4GB_BOUNDARY_CROSSING(p,len) ((size_t)(p)) + +#endif /* HPL_CALL_ACCEL */ + +#endif /* HPL_ACCEL_H */ Index: include/hpl_auxil.h =================================================================== RCS file: /cvsroot/hpl_qs22/include/hpl_auxil.h,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- include/hpl_auxil.h 10 Feb 2008 21:45:50 -0000 1.1 +++ include/hpl_auxil.h 26 Aug 2008 13:24:26 -0000 1.3 @@ -43,6 +43,9 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ #ifndef HPL_AUXIL_H #define HPL_AUXIL_H /* @@ -139,6 +142,15 @@ double HPL_dlamch STDC_ARGS( ( const HPL_T_MACH +) ); + +void* HPL_hpalloc +STDC_ARGS( ( + size_t +) ); +void HPL_hpfree +STDC_ARGS( ( + void * ) ); #endif Index: include/hpl_panel.h =================================================================== RCS file: /cvsroot/hpl_qs22/include/hpl_panel.h,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- include/hpl_panel.h 10 Feb 2008 21:45:50 -0000 1.1 +++ include/hpl_panel.h 26 Aug 2008 13:24:26 -0000 1.3 @@ -43,6 +43,9 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ #ifndef HPL_PANEL_H #define HPL_PANEL_H /* @@ -90,6 +93,7 @@ int pcol; /* proc. col owning 1st col of trail. A */ int msgid; /* message id for panel bcast */ int ldl2; /* local leading dim of array L2 */ + int ldu; /* local leading dim of array U */ int len; /* length of the buffer to broadcast */ #ifdef HPL_CALL_VSIPL vsip_block_d * Ablock; /* A block */ Index: include/hpl_pgesv.h =================================================================== RCS file: /cvsroot/hpl_qs22/include/hpl_pgesv.h,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- include/hpl_pgesv.h 10 Feb 2008 21:45:50 -0000 1.1 +++ include/hpl_pgesv.h 26 Aug 2008 13:24:26 -0000 1.3 @@ -43,6 +43,9 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ #ifndef HPL_PGESV_H #define HPL_PGESV_H /* @@ -71,6 +74,7 @@ HPL_SWAP01 = 452, /* Use HPL_pdlaswp01 */ HPL_SW_MIX = 453, /* Use HPL_pdlaswp00_ for small number of */ /* columns, and HPL_pdlaswp01_ otherwise. 
*/ + HPL_SWAP03 = 454, /* Use HPL_pdlaswp03 */ HPL_NO_SWP = 499 } HPL_T_SWAP; @@ -338,6 +342,14 @@ STDC_ARGS( ( HPL_T_grid *, HPL_T_pmat * +) ); + +void HPL_pdlaswp03T +STDC_ARGS( ( + HPL_T_panel *PBCST, + int *IFLAG, + HPL_T_panel *PANEL, + const int NN ) ); #endif Index: include/hpl_ptest.h =================================================================== RCS file: /cvsroot/hpl_qs22/include/hpl_ptest.h,v retrieving revision 1.1 retrieving revision 1.4 diff -u -r1.1 -r1.4 --- include/hpl_ptest.h 10 Feb 2008 21:45:50 -0000 1.1 +++ include/hpl_ptest.h 26 Aug 2008 13:24:26 -0000 1.4 @@ -43,6 +43,9 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ #ifndef HPL_PTEST_H #define HPL_PTEST_H /* @@ -93,13 +96,17 @@ */ #ifdef HPL_DETAILED_TIMING #define HPL_TIMING_BEG 11 /* timer 0 reserved, used by main */ -#define HPL_TIMING_N 6 /* number of timers defined below */ +#define HPL_TIMING_N (HPL_TIMING_END-HPL_TIMING_BEG) /* number of timers defined below */ #define HPL_TIMING_RPFACT 11 /* starting from here, contiguous */ #define HPL_TIMING_PFACT 12 #define HPL_TIMING_MXSWP 13 #define HPL_TIMING_UPDATE 14 #define HPL_TIMING_LASWP 15 #define HPL_TIMING_PTRSV 16 +#define HPL_TIMING_ACCEL_OVERHEAD 17 +#define HPL_TIMING_ALLGATHER 18 +#define HPL_TIMING_SCATTER 19 +#define HPL_TIMING_END 20 #endif /* * --------------------------------------------------------------------- Index: makes/Make.accel =================================================================== RCS file: makes/Make.accel diff -N makes/Make.accel --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ makes/Make.accel 20 Aug 2008 03:57:53 -0000 1.6 @@ -0,0 +1,74 @@ +# --------------------------------------------------------------- +# (C) Copyright IBM Corporation 2007,2008 +# +# --------------------------------------------------------------- + +include Make.inc + +######################################################################## +# Target +######################################################################## + +ifeq ($(arch),qs22) + +ACCEL_OBJS = HPL_accel_init.o HPL_accel_exit.o \ + HPL_accel_pgesv.o HPL_accel_swap.o \ + HPL_accel_rowget.o HPL_accel_rowput.o \ + HPL_accel_panget.o HPL_accel_panput.o \ + HPL_accel_dgemm.o HPL_accel_dtrsm.o + +all : lib + +lib : lib.grd + +lib.grd : $(ACCEL_OBJS) + $(ARCHIVER) $(ARFLAGS) $(HPLlib) $(ACCEL_OBJS) + $(RANLIB) $(HPLlib) + $(TOUCH) lib.grd + +else + +all : + +endif + +######################################################################## +# Local Defines +######################################################################## + +CCFLAGS += -I$(TOPdir)/accel + +INCdep = $(INCdir)/hpl_accel.h ../HPL_accel_private.h \ + $(TOPdir)/accel/lib/hpl_accel.h + +######################################################################## +# Build Rules +######################################################################## + +HPL_accel_init.o : ../HPL_accel_init.c $(INCdep) + $(CC) -o $@ -c $(CCFLAGS) $< +HPL_accel_exit.o : ../HPL_accel_exit.c $(INCdep) + $(CC) -o $@ -c $(CCFLAGS) $< +HPL_accel_pgesv.o : ../HPL_accel_pgesv.c $(INCdep) + $(CC) -o $@ -c $(CCFLAGS) $< +HPL_accel_swap.o : ../HPL_accel_swap.c $(INCdep) + $(CC) -o $@ -c $(CCFLAGS) $< +HPL_accel_rowget.o : ../HPL_accel_rowget.c $(INCdep) + $(CC) 
-o $@ -c $(CCFLAGS) $< +HPL_accel_rowput.o : ../HPL_accel_rowput.c $(INCdep) + $(CC) -o $@ -c $(CCFLAGS) $< +HPL_accel_panget.o : ../HPL_accel_panget.c $(INCdep) + $(CC) -o $@ -c $(CCFLAGS) $< +HPL_accel_panput.o : ../HPL_accel_panput.c $(INCdep) + $(CC) -o $@ -c $(CCFLAGS) $< +HPL_accel_dtrsm.o : ../HPL_accel_dtrsm.c $(INCdep) + $(CC) -o $@ -c $(CCFLAGS) $< +HPL_accel_dgemm.o : ../HPL_accel_dgemm.c $(INCdep) + $(CC) -o $@ -c $(CCFLAGS) $< + +######################################################################## +# Clean Rules +######################################################################## + +clean : + $(RM) *.o Index: makes/Make.auxil =================================================================== RCS file: /cvsroot/hpl_qs22/makes/Make.auxil,v retrieving revision 1.1 retrieving revision 1.4 diff -u -r1.1 -r1.4 --- makes/Make.auxil 10 Feb 2008 21:45:50 -0000 1.1 +++ makes/Make.auxil 26 Aug 2008 13:24:26 -0000 1.4 @@ -43,6 +43,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ###################################################################### +# Modifications (C) Copyright IBM Corporation 2008 +# ###################################################################### # include Make.inc # @@ -58,7 +60,7 @@ HPL_warn.o HPL_abort.o HPL_dlaprnt.o \ HPL_dlange.o HPL_au1obj = \ - HPL_dlamch.o + HPL_dlamch.o HPL_hpalloc.o HPL_auxobj = \ $(HPL_au0obj) $(HPL_au1obj) # @@ -91,6 +93,8 @@ $(CC) -o $@ -c $(CCFLAGS) ../HPL_dlange.c HPL_dlamch.o : ../HPL_dlamch.c $(INCdep) $(CC) -o $@ -c $(CCNOOPT) ../HPL_dlamch.c +HPL_hpalloc.o : ../HPL_hpalloc.c $(INCdep) + $(CC) -o $@ -c $(CCFLAGS) $< # # ###################################################################### # Index: makes/Make.comm =================================================================== RCS file: /cvsroot/hpl_qs22/makes/Make.comm,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 Index: makes/Make.panel =================================================================== RCS file: /cvsroot/hpl_qs22/makes/Make.panel,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- makes/Make.panel 10 Feb 2008 21:45:50 -0000 1.1 +++ makes/Make.panel 26 Aug 2008 13:24:26 -0000 1.3 @@ -43,6 +43,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ###################################################################### +# Modifications (C) Copyright IBM Corporation 2008 +# ###################################################################### # include Make.inc # @@ -52,7 +54,7 @@ $(INCdir)/hpl_misc.h $(INCdir)/hpl_blas.h $(INCdir)/hpl_auxil.h \ $(INCdir)/hpl_pmisc.h $(INCdir)/hpl_grid.h $(INCdir)/hpl_comm.h \ $(INCdir)/hpl_pauxil.h $(INCdir)/hpl_panel.h $(INCdir)/hpl_pfact.h \ - $(INCdir)/hpl_pgesv.h + $(INCdir)/hpl_pgesv.h $(INCdir)/hpl_accel.h # ## Object files ######################################################## # Index: makes/Make.pgesv =================================================================== RCS file: /cvsroot/hpl_qs22/makes/Make.pgesv,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- makes/Make.pgesv 10 Feb 2008 21:45:50 -0000 1.1 +++ makes/Make.pgesv 26 Aug 2008 13:24:26 -0000 1.3 @@ -43,6 +43,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ###################################################################### +# Modifications (C) Copyright IBM Corporation 2008 +# ###################################################################### # include Make.inc # @@ -64,7 +66,8 @@ HPL_equil.o HPL_pdlaswp01N.o HPL_pdlaswp01T.o \ HPL_pdupdateNN.o HPL_pdupdateNT.o HPL_pdupdateTN.o \ HPL_pdupdateTT.o HPL_pdtrsv.o HPL_pdgesv0.o \ - HPL_pdgesvK1.o HPL_pdgesvK2.o HPL_pdgesv.o + HPL_pdgesvK1.o HPL_pdgesvK2.o HPL_pdgesv.o \ + HPL_pdlaswp03T.o # ## Targets ############################################################# # @@ -127,6 +130,8 @@ $(CC) -o $@ -c $(CCFLAGS) ../HPL_pdgesvK2.c HPL_pdgesv.o : ../HPL_pdgesv.c $(INCdep) $(CC) -o $@ -c $(CCFLAGS) ../HPL_pdgesv.c +HPL_pdlaswp03T.o : ../HPL_pdlaswp03T.c $(INCdep) + $(CC) -o $@ -c $(CCFLAGS) $< # # ###################################################################### # Index: makes/Make.ptest =================================================================== RCS file: /cvsroot/hpl_qs22/makes/Make.ptest,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- makes/Make.ptest 10 Feb 2008 21:45:50 -0000 1.1 +++ makes/Make.ptest 26 Aug 2008 13:24:26 -0000 1.3 @@ -43,6 +43,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ###################################################################### +# Modifications (C) Copyright IBM Corporation 2008 +# ###################################################################### # include Make.inc # @@ -72,7 +74,7 @@ $(BINdir)/HPL.dat : ../HPL.dat ( $(CP) ../HPL.dat $(BINdir) ) # -dexe.grd: $(HPL_pteobj) $(HPLlib) +dexe.grd: $(HPL_pteobj) $(HPLlib) $(ACLlib) $(LINKER) $(LINKFLAGS) -o $(xhpl) $(HPL_pteobj) $(HPL_LIBS) $(MAKE) $(BINdir)/HPL.dat $(TOUCH) dexe.grd Index: src/accel/HPL_accel_dgemm.c =================================================================== RCS file: src/accel/HPL_accel_dgemm.c diff -N src/accel/HPL_accel_dgemm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/accel/HPL_accel_dgemm.c 20 Aug 2008 03:57:54 -0000 1.15 @@ -0,0 +1,241 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "hpl.h" + +#include "HPL_accel_private.h" + +static unsigned long long completion_flags = 0; + +/* ---------------------------------------------------------------- */ +int HPL_accel_dgemm_async +( + HPL_T_panel *PANEL, /* Panel structure for inputs to dgemm */ + int j1, /* Relative index of first column of B and C inputs */ + int nn /* number of columns of rhs input in the row_buf */ +) +/* + * Purpose + * ======= + * + * HPL_accel_dgemm_async initiates a dgemm on the accelerator. The lower triangular + * matrix input to dgemm is in the blk_buf, and the right-hand sides are in + * the row_buf.
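+ * + * A typical caller splits the update in two steps so that communication can overlap the accelerated computation; a sketch only, assuming the panel data has already been staged on the accelerator: + * + * HPL_accel_dgemm_async( PANEL, j1, nn ); start the update + * ... overlap the panel broadcast / MPI progress here ... + * HPL_accel_dgemm_wait( PANEL ); block until complete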
+ * + * ----------------------------------------------------------------- + */ +{ + int ii = PANEL->ii; /* local row index (zero based) of first row of C matrix */ + int jj = PANEL->jj; /* local column index (zero based) of first column of C matrix */ + int jb = PANEL->jb; /* number of panel columns / row buffer rows */ + int mp = PANEL->mp; /* number of local rows of panel */ + int lda = PANEL->lda; /* local leading dimension of matrix */ + int ldl2 = PANEL->ldl2; /* local leading dimension of L2 panel */ + + if ( panel_prep >= (PANEL->jj+j1) ) { + if ( PANEL->grid->myrow == PANEL->prow ) { + + HPL_dgemm(HplColumnMajor, HplNoTrans, HplNoTrans, mp-jb, nn, jb, -HPL_rone, + PANEL->L2, ldl2, Mptr(PANEL->A, 0, j1, lda), lda, HPL_rone, + Mptr(PANEL->A, jb, j1, lda), lda); + + } else { + int ldu = PANEL->ldu; + + HPL_dgemm(HplColumnMajor, HplNoTrans, HplTrans, mp, nn, jb, -HPL_rone, + PANEL->L2, ldl2, PANEL->U+j1, ldu, HPL_rone, + Mptr(PANEL->A, 0, j1, lda), lda); + + } + return 0; + } + + TRACE_PRINT("%s [%d] entry ii=%d jj=%d jb=%d j1=%d nn=%d\n", + __FUNCTION__, PANEL->grid->iam, ii, jj, jb, j1, nn); + + /* If this row of processors holds the U panel, then the number of rows to + update is JB rows less than the number of rows in the trailing submatrix. */ + + if ( PANEL->grid->myrow == PANEL->prow ) { + ii += jb; + mp -= jb; + } + + if (mp <= 0 || nn <= 0) { return 0; } + + int mp_pad = ((mp+M_SUB-1)/M_SUB)*M_SUB; + + /* Pad out rows to get acceleration when appropriate */ + if ( (mp>=M_SUB) && (nn>=M_SUB) ) + mp = (mp_pad > ldl2) ? mp : mp_pad; + + /* The L2 panel may reside in the matrix, and thus could cross a 4GB boundary + for large problem sizes. When this happens, we just copy the L2 panel over + to one of our extra panel buffers. */ + + double *L2 = PANEL->L2; /* L2 panel */ + if ( CROSSES_4GB_BOUNDARY(L2, ldl2*jb*sizeof(double)) ) { + memcpy(pan_buf[1], L2, ldl2*jb*sizeof(double)); + L2 = pan_buf[1]; + } + + if ( PANEL->grid->myrow == PANEL->prow ) { + int ldu = lda*M_SUB; + double *U = &PANEL->pmat->A[INDEX_BLK(PANEL->ii, 0, ldu)]; + + hpl_accel_dgemm_CL_B_B_CL( + /* IN (int) Number of rows in a, c, and panel */ mp, + /* IN (int) Number of cols in b, c, and panel */ nn, + /* IN (int) Number of cols in a and rows in b */ jb, + /* IN (double*) matrix a is the L2 panel */ L2, + /* IN (int) Leading dimension of L2 */ ldl2, + /* IN (double*) matrix b (U), block row format */ U, + /* IN (int) Leading dimension of b (U) */ ldu, + /* INOUT (double*) c matrix is trailing matrix */ PANEL->pmat->A, + /* IN (int) Leading dimension of trailing matrix + (number of doubles to advance from block column + n to block column n+1. 
(i.e., from column n to + column n+M_SUB) */ lda*M_SUB, + /* IN (uint) Starting block matrix row offset */ ii, + /* IN (uint) Starting block matrix column offset */ jj+j1, + /* INOUT (double *) panel to hold result or NULL */ NULL, + /* IN (int) leading dimension of panel */ 0, + /* IN (unsigned long long *) Completion variable */ &completion_flags); + + } else { + + hpl_accel_dgemm_CL_R_B_CL( + /* IN (int) Number of rows in a, c, and panel */ mp, + /* IN (int) Number of cols in b, c, and panel */ nn, + /* IN (int) Number of cols in a and rows in b */ jb, + /* IN (double*) matrix a is the L2 panel */ L2, + /* IN (int) Leading dimension of L2 */ ldl2, + /* IN (double*) b matrix is the U panel */ PANEL->U, + /* IN (int) Leading dimension of U panel */ PANEL->ldu, + /* INOUT (double*) c matrix is trailing matrix */ PANEL->pmat->A, + /* IN (int) Leading dimension of trailing matrix + (number of doubles to advance from block column + n to block column n+1, i.e., from column n to + column n+M_SUB) */ lda*M_SUB, + /* IN (uint) Starting block matrix row offset */ ii, + /* IN (uint) Starting block matrix column offset */ jj+j1, + /* INOUT (double *) panel to hold result or NULL */ NULL, + /* IN (int) leading dimension of panel */ 0, + /* IN (unsigned long long *) Completion variable */ &completion_flags); + } + + return 0; +} + +/* ---------------------------------------------------------------- */ +int HPL_accel_dgemm_wait +( + HPL_T_panel *PANEL /* Panel structure for inputs to dgemm */ +) +/* + * Purpose + * ======= + * + * HPL_accel_dgemm_wait blocks until the dgemm most recently initiated on the + * accelerator by HPL_accel_dgemm_async has completed. + * + * ----------------------------------------------------------------- + */ +{ + wait_for(&completion_flags); + + return 0; +} + +/* ---------------------------------------------------------------- */ +int HPL_accel_dgemm +( + HPL_T_panel *PANEL, /* Panel structure for inputs to dgemm */ + int j1, /* Relative index of first column of B and C inputs */ + int nn /* number of columns of rhs input in the row_buf */ +) +/* + * Purpose + * ======= + * + * HPL_accel_dgemm performs a dgemm on the accelerator by initiating it and + * waiting for its completion. The lower triangular matrix input to dgemm is + * in the blk_buf, and the right-hand sides are in the row_buf. + * + * ----------------------------------------------------------------- + */ +{ + HPL_accel_dgemm_async(PANEL, j1, nn); + + HPL_accel_dgemm_wait(PANEL); + + return 0; +} + +/* ---------------------------------------------------------------- */ +void HPL_accel_dgemmCL +( + int m, /* Number of rows in a and c */ + int n, /* Number of cols in b and c */ + int k, /* Number of cols in a and rows in b */ + const double *a, /* Column ordered a matrix */ + int lda, /* Leading dimension of a matrix */ + const double *b, /* Column ordered b matrix */ + int ldb, /* Leading dimension of b matrix */ + double *c, /* Column ordered c matrix */ + int ldc /* Leading dimension of c matrix */ +) +/* + * Purpose + * ======= + * + * HPL_accel_dgemmCL performs a dgemm on the accelerator. + * + * ----------------------------------------------------------------- + */ +{ + if ( k<=0 || n<=0 ) { return; } + + /* The input panels may reside in the matrix, and thus could cross a 4GB + boundary for large problem sizes. When this happens, we copy the + offending panel over to one of our extra panel buffers.
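+ + For example (hypothetical addresses): with p = 0xFFFFF000 and len = 8192, + ((size_t)p)>>32 == 0 while (((size_t)p)+len-1)>>32 == 1, so + CROSSES_4GB_BOUNDARY(p,len) holds and the panel must be staged through + pan_buf[] rather than handed to the accelerator in place.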
*/ + + int i; + double *a_panel = (double *)a; + double *c_panel = (double *)c; + + if ( CROSSES_4GB_BOUNDARY(a, (k*lda-1)*sizeof(double)) ) { + a_panel = pan_buf[0]; + for (i=0; i<k; i++) { + memcpy(a_panel+i*lda, a+i*lda, m*sizeof(double)); + } + } + + if ( CROSSES_4GB_BOUNDARY(c, (n*ldc-1)*sizeof(double)) ) { + /* Halve wide updates until each piece lies within a 4GB span */ + if (n >= 8) { + int n1 = n / 2, n2 = n - n1; + HPL_accel_dgemmCL(m, n1, k, a, lda, b, ldb, c, ldc); + HPL_accel_dgemmCL(m, n2, k, a, lda, b+n1*ldb, ldb, c+n1*ldc, ldc); + return; + } + + c_panel = pan_buf[1]; + for (i=0; i<n; i++) { + memcpy(c_panel+i*ldc, c+i*ldc, m*sizeof(double)); + } + } + + /* ... the accelerated column-ordered multiply on a_panel, b, and c_panel, + its completion wait, and the copy of c_panel back into c when c was + staged, follow here ... */ +} Index: src/accel/HPL_accel_dtrsm.c =================================================================== RCS file: src/accel/HPL_accel_dtrsm.c diff -N src/accel/HPL_accel_dtrsm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/accel/HPL_accel_dtrsm.c 20 Aug 2008 03:57:54 -0000 +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "hpl.h" + +#include "HPL_accel_private.h" + +/* ---------------------------------------------------------------- */ +int HPL_accel_dtrsm +( + HPL_T_panel *PANEL, /* Panel structure for inputs to dtrsm */ + int j1, /* Relative index of first column of the rhs input */ + int nn /* number of columns of rhs input in the row_buf */ +) +/* + * Purpose + * ======= + * + * HPL_accel_dtrsm performs the triangular solve for the current panel on + * the accelerator. The unit lower triangular factor is in L1; the + * right-hand sides are either in the U row buffer or stored directly in + * the block-formatted matrix. + * + * ----------------------------------------------------------------- + */ +{ + int jb = PANEL->jb; + int lda = PANEL->lda; + int ldl1 = PANEL->jb; + int ldu = PANEL->ldu; + + if ( panel_prep >= PANEL->jj ) { + if ( PANEL->U == NULL ) { + HPL_dtrsm( HplColumnMajor, HplLeft, HplLower, HplNoTrans, + HplUnit, jb, nn, HPL_rone, PANEL->L1, jb, Mptr(PANEL->A, 0, j1, PANEL->lda), lda ); + } else { + HPL_dtrsm( HplColumnMajor, HplRight, HplLower, HplTrans, + HplUnit, nn, jb, HPL_rone, PANEL->L1, jb, PANEL->U+j1, ldu ); + + if ( PANEL->grid->myrow == PANEL->prow ) { + HPL_dlatcpy( jb, nn, PANEL->U+j1, ldu, Mptr(PANEL->A, 0, j1, PANEL->lda), lda ); + } + } + return 0; + } + + TRACE_PRINT("%s [%d] entry ii=%d jj=%d jb=%d nn=%d\n", + __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, PANEL->jb, nn); + + unsigned long long completion_flags = 0; + + /* If this processor is in the row of processors that holds the + top row of the trailing matrix, set the C matrix pointer and ldc + so that the dtrsm result goes straight into the matrix. Note that + DGEMM must be sure to use the data in the matrix rather than out of + the U buffer in this case. */ + + if ( PANEL->U == NULL ) { + + /* When there is no U buffer, that means that the rows are stored in the matrix. */ + + hpl_accel_dtrsm_CL_B( + /* IN (int) Number of rows in L1 matrix and rhs matrix b */ jb, + /* IN (int) Number of columns in rhs matrix b */ nn, + /* IN (double*) L1 matrix */ PANEL->L1, + /* IN (int) Leading dimension of L1 */ ldl1, + /* INOUT (double*) c matrix - alternate result area */ PANEL->pmat->A, + /* IN (int) Leading dimension of c matrix */ lda*M_SUB, + /* IN (int) Block row */ PANEL->ii, + /* IN (int) Block col */ PANEL->jj+j1, + /* IN (unsigned long long *) Completion variable */ &completion_flags); + + } else if ( PANEL->grid->myrow == PANEL->prow ) { + + hpl_accel_dtrsm_CL_R_B( + /* IN (int) Number of rows in L1 matrix and rhs matrix b */ jb, + /* IN (int) Number of columns in rhs matrix b */ nn, + /* IN (double*) L1 matrix */ PANEL->L1, + /* IN (int) Leading dimension of L1 */ ldl1, + /* INOUT (double*) rhs matrix */ PANEL->U+j1, + /* IN (int) Leading dimension of rhs matrix */ ldu, + /* INOUT (double*) c matrix - alternate result area */ PANEL->pmat->A, + /* IN (int) Leading dimension of c matrix */ lda*M_SUB, + /* IN (int) Block row */ PANEL->ii, + /* IN (int) Block col */ PANEL->jj+j1, + /* IN (unsigned long long *) Completion variable */ &completion_flags); + + } else { + hpl_accel_dtrsm_CL_R_B( + /* IN (int) Number of rows in L1 matrix and rhs matrix b */ jb, + /* IN (int) Number of columns in rhs matrix b */ nn, + /* IN (double*) L1 matrix */ PANEL->L1, + /* IN (int) Leading dimension of L1 */ ldl1, + /* INOUT (double*) rhs matrix */ PANEL->U+j1, + /* IN (int) Leading dimension of rhs matrix */ ldu, + /* INOUT (double*) c matrix - alternate result area */ NULL, + /* IN (int) Leading dimension of c matrix */ 0, + /* IN (int) Block row */ 0, + /* IN (int) Block col */ 0, + /* IN (unsigned long long *) Completion variable */ &completion_flags); + } + + wait_for(&completion_flags); + + return 0; +} Index: src/accel/HPL_accel_exit.c =================================================================== RCS file:
src/accel/HPL_accel_exit.c diff -N src/accel/HPL_accel_exit.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/accel/HPL_accel_exit.c 20 Aug 2008 03:57:54 -0000 1.2 @@ -0,0 +1,17 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "hpl.h" +#include "HPL_accel_private.h" + +/* ---------------------------------------------------------------- */ +/* HPL_accel_exit */ +/* ---------------------------------------------------------------- */ +int HPL_accel_exit(int my_rank) +{ + TRACE_PRINT("\nHPL_accel_exit: Done!\n"); + + return(0); +} Index: src/accel/HPL_accel_init.c =================================================================== RCS file: src/accel/HPL_accel_init.c diff -N src/accel/HPL_accel_init.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/accel/HPL_accel_init.c 20 Aug 2008 03:57:54 -0000 1.4 @@ -0,0 +1,31 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "hpl.h" + +#include "HPL_accel_private.h" + +/* ---------------------------------------------------------------- */ +/* HPL_accel_init */ +/* ---------------------------------------------------------------- */ +int HPL_accel_init(int my_rank) +{ + int lib_rc; + + /* Start the accelerator process */ + TRACE_PRINT("\nHPL_accel_init[%d]: Starting accelerator process.\n", my_rank); + + /* Initialize the accelerator library */ + lib_rc = hpl_accel_init(); + if (lib_rc != HPL_ACCEL_INIT_SUCCESS) { + fprintf(stdout,"\nHPL_accel_init[%d]: hpl_accel_init failed.\n", my_rank); + fflush(stdout); + return -1; + } + + TRACE_PRINT("\nHPL_accel_init[%d]: The accelerator process started.\n", my_rank); + + return 0; +} Index: src/accel/HPL_accel_panget.c =================================================================== RCS file: src/accel/HPL_accel_panget.c diff -N src/accel/HPL_accel_panget.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/accel/HPL_accel_panget.c 20 Aug 2008 03:57:54 -0000 1.4 @@ -0,0 +1,76 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "hpl.h" + +#include "HPL_accel_private.h" + +/* ---------------------------------------------------------------- */ +int HPL_accel_pangetL +( + HPL_T_panel *PANEL /* Panel structure specifying the panel to reformat */ +) +/* + * Purpose + * ======= + * + * HPL_accel_pangetL reformats the specified panel from blocked-row + * format back to column major format. This operation is performed + * prior to panel factorization on the host.
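+ * + * Functionally the reformat is the scalar copy sketched below (assuming INDEX_COL and INDEX_BLK address the column-major and blocked layouts, respectively); the accelerator call simply performs it with bulk transfers: + * + * for (j = 0; j < reform_cols; j++) + * for (i = 0; i < PANEL->pmat->mp; i++) + * pbuf[INDEX_COL(i, j, lda)] = panel[INDEX_BLK(i, j, M_SUB*lda)];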
+ * + * ---------------------------------------------------------------- + */ +{ + int jj = PANEL->jj; /* local column index (zero based) of first column of panel */ + int lda = PANEL->lda; /* local leading dimension of matrix */ + + /* If this node does not contain any rows of the matrix, then just return */ + if (PANEL->pmat->mp <= 0) { return 0; } + + /* If this panel has already been prepped, then just return */ + if ( panel_prep >= jj ) { return 0; } + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD ); +#endif + + TRACE_PRINT("%s [%d] entry ii=%d jj=%d jb=%d\n", + __FUNCTION__, my_rank, PANEL->ii, PANEL->jj, PANEL->jb); + + /* Prepare the next block column to be factored, which means convert it + to column major. We do this using our special extra panel buffer. */ + + int reform_cols = Mmin(PANEL->nq, PANEL->nb); + + /* Block column in matrix to receive prepped panel */ + double *panel = PANEL->pmat->A+INDEX_BLK(0, jj, lda*M_SUB); + double *pbuf = pan_buf[0]; + + unsigned long long completion_flags = 0; + + hpl_accel_reform_panel_B_to_CL( + /* IN (int) Number of rows of matrix a to copy to panel */ PANEL->pmat->mp, + /* IN (int) Number of columns of matrix a to copy to panel */ reform_cols, + /* OUT (double*) Panel buffer to receive the reformatted panel */ pbuf, + /* IN (int) Leading dimension of panel */ lda, + /* IN (double*) Block formatted matrix */ panel, + /* IN (int) Leading dimension of matrix a. The number of doubles to + advance from block column n to block column n+1. (i.e., from + column n to column n+M_SUB) */ M_SUB*lda, + /* IN (unsigned long long *) Completion variable */ &completion_flags); + + wait_for(&completion_flags); + + memcpy(panel, pbuf, reform_cols*lda*sizeof(double)); + + /* Columns up to panel_prep have been converted to column major */ + panel_prep += reform_cols; + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD ); +#endif + + return 0; +} Index: src/accel/HPL_accel_panput.c =================================================================== RCS file: src/accel/HPL_accel_panput.c diff -N src/accel/HPL_accel_panput.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/accel/HPL_accel_panput.c 20 Aug 2008 03:57:54 -0000 1.4 @@ -0,0 +1,53 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "hpl.h" + +#include "HPL_accel_private.h" + +/* ---------------------------------------------------------------- */ +int HPL_accel_panputU +( + HPL_T_panel *PANEL, /* Panel structure containing data to be + sent to the accelerator */ + double *data, /* area containing the data(in row maj format) to + be copied to the row buffer */ + int ld, /* leading dimension of data (amount to advance + from row i to row i+1) */ + int *rows, /* array of local row indices to be copied */ + int nn /* number of columns of each row to be copied to accel */ +) +/* + * Purpose + * ======= + * + * HPL_accel_panputU copies data from host storage in row-major format to + * the row buffer on the accelerator, also in row-major format. No + * endianness conversions are done (the data is already big-endian). 
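+ * + * For instance, once a pivot-row block has been gathered into a host buffer rbuf in row-major order with nn doubles per row (rbuf and rows are assumed, caller-managed names), it can be staged into U with: + * + * HPL_accel_panputU( PANEL, rbuf, nn, rows, nn );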
+ * + * ----------------------------------------------------------------- + */ +{ + if (PANEL->mp<=0) { return 0; } + + TRACE_PRINT("%s [%d] entry ii=%d jj=%d jb=%d nn=%d\n", + __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, PANEL->jb, nn); + + unsigned long long completion_flags = 0; + + hpl_accel_copy_rows_R_to_R( + /* (int) number of rows to copy */ PANEL->jb, + /* (int) number of columns to copy in each row */ nn, + /* (double *) row-ordered source matrix */ data, + /* (int) leading dimension of source matrix */ ld, + /* (double *) row-ordered destination matrix */ PANEL->U, + /* (int) leading dimension of target matrix */ PANEL->ldu, + /* (int*) array of destination row indices */ rows, + /* (unsigned long long *) completion variable */ &completion_flags); + + wait_for(&completion_flags); + + return 0; +} Index: src/accel/HPL_accel_pgesv.c =================================================================== RCS file: src/accel/HPL_accel_pgesv.c diff -N src/accel/HPL_accel_pgesv.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/accel/HPL_accel_pgesv.c 20 Aug 2008 03:57:54 -0000 1.10 @@ -0,0 +1,196 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "hpl.h" + +#include "HPL_accel_private.h" + +/* ---------------------------------------------------------------- */ +/* Global variables */ +/* ---------------------------------------------------------------- */ + +void *pgesv_mem; +void *pan_buf[2]; +int panel_prep; + +/* ---------------------------------------------------------------- */ +int HPL_accel_pgesv_init +( + HPL_T_grid *GRID, + HPL_T_palg *ALGO, + HPL_T_pmat *A +) +/* + * Purpose + * ======= + * + * HPL_accel_pgesv_init prepares the accelerator to participate in a pgesv + * computation. + * + * Arguments + * ========= + * + * GRID (local input) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * ALGO (local input) HPL_T_palg * + * On entry, ALGO points to a data structure containing the + * algorithm parameters. + * + * A (local input/output) HPL_T_pmat * + * On entry, A points to the data structure containing the local + * array information. + * + */ +{ + /* Initialize panel_prep to indicate that no panels have been prepped */ + panel_prep = -1; + + /* If this node does not contain any rows of the matrix, then just return */ + if (A->mp <= 0) { return 0; } + + TRACE_PRINT("%s [%d] entry myprow=%d mypcol=%d mp=%d nq=%d\n", + __FUNCTION__, GRID->iam, GRID->myrow, GRID->mycol, A->mp, A->nq); + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD ); +#endif + + /* Create a scratch area for matrix reformatting */ + + int pan_size = A->ld * A->nb * sizeof(double); + int alloc_size = 3 * (pan_size + PAGESIZE); + +#ifdef HPL_USE_HUGE_PAGES + pgesv_mem = HPL_hpalloc(alloc_size); +#else + pgesv_mem = malloc(alloc_size); +#endif + assert ( pgesv_mem != NULL ); + + /* Reformat the matrix [a] from column-order to blocked format. */ + + int nq = A->nq; + double *mat_data = A->A; + + /* Pad out the number of columns to be a multiple of M_SUB */ + nq = ((nq+M_SUB-1)/M_SUB)*M_SUB; + + /* If this processor is in the first column of processors, don't reformat + the first block column since it needs to stay in column-major order for + panel factorization. 
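+ + (Worked example, with a hypothetical M_SUB = 64 and nb = 64: a local nq + of 100 is padded to 128 reformatted columns, and on process column 0 the + first 64 stay column major, leaving 64 columns to reformat.)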
*/ + + if (GRID->mycol == 0) { + nq -= A->nb; + mat_data = &mat_data[INDEX_BLK(0,A->nb,M_SUB*A->ld)]; + + /* Columns up to panel_prep have been converted to column major */ + panel_prep += A->nb; + } + + if (nq > 0) { + double *scratch_buf = (double *) FIX_4GB_BOUNDARY_CROSSING( + ALIGN_PTR(pgesv_mem, PAGESIZE), + pan_size); + unsigned long long completion_flags; + + hpl_accel_reform_matrix_CL_to_B( + /* IN (int) Number of rows in matrix a */ A->mp, + /* IN (int) Number of cols in matrix a */ nq, + /* IN (double*) Matrix data in column-ordered, big-endian format + OUT Matrix data in blocked, big-endian format */ mat_data, + /* IN (int) Leading dimension of matrix a. The number of doubles to + advance from column n to column n+1. */ A->ld, + /* IN (double*) Scratch area of at least 64*mp elems */ scratch_buf, + /* IN (int) size of scratch_buf in doubles */ pan_size/sizeof(double), + /* IN (unsigned long long *) Completion variable */ &completion_flags); + + wait_for(&completion_flags); + } + + /* Carve up allocated storage into buffers for pgesv computation */ + + void *free_area = pgesv_mem; + + int i; + for (i = 0; i<2; i++) { + pan_buf[i] = (double *) FIX_4GB_BOUNDARY_CROSSING( + ALIGN_PTR(free_area, PAGESIZE), + pan_size); + free_area = (void*)pan_buf[i] + pan_size; + } + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD ); +#endif + + return 0; +} + + +/* ---------------------------------------------------------------- */ +int HPL_accel_pgesv_fini +( + HPL_T_grid *GRID, + HPL_T_palg *ALGO, + HPL_T_pmat *A +) +/* + * Purpose + * ======= + * + * HPL_accel_pgesv_fini cleans up the accelerator at the completion of a + * pgesv computation. + * + * Arguments + * ========= + * + * GRID (local input) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * ALGO (local input) HPL_T_palg * + * On entry, ALGO points to a data structure containing the + * algorithm parameters. + * + * A (local input/output) HPL_T_pmat * + * On entry, A points to the data structure containing the local + * array information. + */ +{ + /* If this node does not contain any rows of the matrix, then just return */ + if (A->mp <= 0) { return 0; } + + TRACE_PRINT("%s [%d] entry\n", __FUNCTION__, GRID->iam); + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD ); +#endif + + /* A special case: when the last block column contains only the + right-hand-side vector, we must convert this from row-blocked + format to column major here.
*/ + + if ( (A->nq % A->nb) == 1) { + int x; + for (x=0; x<A->mp; x++) { + A->A[INDEX_COL(x, A->nq-1, A->ld)] = A->A[INDEX_BLK(x, A->nq-1, M_SUB*A->ld)]; + } + } + +#ifdef HPL_USE_HUGE_PAGES + HPL_hpfree(pgesv_mem); +#else + free(pgesv_mem); +#endif + pgesv_mem = NULL; + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer( HPL_TIMING_ACCEL_OVERHEAD ); +#endif + + return 0; +} Index: src/accel/HPL_accel_private.h =================================================================== RCS file: src/accel/HPL_accel_private.h diff -N src/accel/HPL_accel_private.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/accel/HPL_accel_private.h 20 Aug 2008 03:57:54 -0000 1.6 @@ -0,0 +1,49 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef HPL_ACCEL_PRIVATE_H +#define HPL_ACCEL_PRIVATE_H + +/* ---------------------------------------------------------------- */ +/* Include files */ +/* ---------------------------------------------------------------- */ + +#include <assert.h> +#include "lib/hpl_accel.h" /* M_SUB */ +#include <ppu_intrinsics.h> /* __mftb */ + +/* ---------------------------------------------------------------- */ +/* Global variables */ +/* ---------------------------------------------------------------- */ + +extern void *pan_buf[2]; +extern int panel_prep; + +/* ---------------------------------------------------------------- */ +/* Macros */ +/* ---------------------------------------------------------------- */ + +//#include <stdio.h> /* printf */ + +#define TRACE_PRINT(s, ...) +//#define TRACE_PRINT printf + +#define PAGESIZE (4096) + +#define ALIGN_PTR(p,b) ( ( ((size_t)(p)+(b)-1) / (b) ) * (b) ) + +/* ---------------------------------------------------------------- */ +/* Inline functions */ +/* ---------------------------------------------------------------- */ + +static inline void wait_for(unsigned long long *completion_flags) +{ + volatile unsigned long long *flagptr = completion_flags; + + while ( *flagptr != 0 ) {} } + + +#endif /* HPL_ACCEL_PRIVATE_H */ Index: src/accel/HPL_accel_rowget.c =================================================================== RCS file: src/accel/HPL_accel_rowget.c diff -N src/accel/HPL_accel_rowget.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/accel/HPL_accel_rowget.c 20 Aug 2008 03:57:54 -0000 1.3 @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "hpl.h" +#include "HPL_accel_private.h" + +/* ---------------------------------------------------------------- */ +int HPL_accel_rowget +( + HPL_T_panel *PANEL, /* Panel structure for current panel */ + double *data, /* area to receive the data from the accelerator matrix (in row-major format) */ + int ld, /* leading dimension of data (amount to advance from row i to row i+1) */ + int numrows, /* number of rows to be copied from the accelerator matrix */ + int *rows, /* array of local row indices to be copied */ + int jj, /* local column index of first column to be copied into data */ + int nn /* number of columns of each row to be copied into data */ +) +/* + * Purpose + * ======= + * + * HPL_accel_rowget copies data from a block-formatted matrix into a buffer in + * row-major format.
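+ * + * A row-exchange step might use the get/put pair like this (a sketch; W, ldw, and nrows are caller-managed, assumed names): + * + * HPL_accel_rowget( PANEL, W, ldw, nrows, rows, jj, nn ); + * ... exchange W with the partner process ... + * HPL_accel_rowput( PANEL, W, ldw, nrows, rows, jj, nn );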
+ * + * ----------------------------------------------------------------- + */ +{ + int i = 0; + + if (numrows<=0 || nn <=0) { return 0; } + + TRACE_PRINT("%s [%d] entry jj=%d numrows=%d nn=%d\n", + __FUNCTION__, PANEL->grid->iam, jj, numrows, nn); + + if ( panel_prep >= jj ) { + unsigned int x; + + for (i=0; i<numrows; i++) { + if (rows[i]>=0) { + double *dest = data + i*ld; + for (x=0; x<(unsigned int)nn; x++) { + dest[x] = PANEL->pmat->A[INDEX_COL(rows[i], jj+x, PANEL->lda)]; + } + } + } + + return 0; + } + + while (i<numrows) { + /* Skip rows flagged with a negative local index */ + if (rows[i] < 0) { i++; continue; } + + /* Count the run of consecutive valid row indices starting at i */ + int j = 1; + while ((i+j<numrows) && (rows[i+j]>=0)) { + j++; + } + + double *dest = data + i*ld; + + unsigned long long completion_flags; + + hpl_accel_reform_rows_B_to_R( + /* IN (int) Number of rows to copy */ j, + /* IN (int) Number of values (doubles) per row to copy */ nn, + /* OUT (double*) Buffer to receive row-formatted data */ dest, + /* IN (int) Leading dimension of the row buffer */ ld, + /* IN (double*) block formatted matrix */ PANEL->pmat->A, + /* IN (int) Leading dimension of matrix [a] */ M_SUB*PANEL->lda, + /* IN (int*) Array of row indices */ &rows[i], + /* IN (int) Starting [a] block matrix column offset */ jj, + /* IN (unsigned long long *) Completion variable */ &completion_flags); + + wait_for(&completion_flags); + + i+=j; + } + + return 0; +} Index: src/accel/HPL_accel_rowput.c =================================================================== RCS file: src/accel/HPL_accel_rowput.c diff -N src/accel/HPL_accel_rowput.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/accel/HPL_accel_rowput.c 20 Aug 2008 03:57:54 -0000 1.3 @@ -0,0 +1,84 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "hpl.h" +#include "HPL_accel_private.h" + +/* ---------------------------------------------------------------- */ +int HPL_accel_rowput +( + HPL_T_panel *PANEL, /* Panel structure for current panel */ + double *data, /* area containing the data (in row-major format) to be copied to the accel */ + int ld, /* leading dimension of data (amount to advance from row i to row i+1) */ + int numrows, /* number of rows to be copied to the accelerator matrix */ + int *rows, /* array of local row indices to be copied */ + int jj, /* local column index of first column to be copied to accel */ + int nn /* number of columns of each row to be copied to accel */ +) +/* + * Purpose + * ======= + * + * HPL_accel_rowput copies data from a buffer in row-major format to a block-formatted matrix.
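+ * + * Rows are moved in runs of consecutive valid indices; e.g., with rows = { 3, 7, -1, 12 } the accelerator is invoked once for the run {3, 7} and once for {12}, while the negative entry is skipped.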
+ * + * ----------------------------------------------------------------- + */ +{ + int i = 0; + + if (numrows<=0 || nn <=0) { return 0; } + + TRACE_PRINT("%s [%d] entry jj=%d numrows=%d nn=%d\n", + __FUNCTION__, PANEL->grid->iam, jj, numrows, nn); + + if ( panel_prep >= jj ) { + unsigned int x; + + for (i=0; i<numrows; i++) { + if (rows[i]>=0) { + double *src = data + i*ld; + for (x=0; x<(unsigned int)nn; x++) { + PANEL->pmat->A[INDEX_COL(rows[i], jj+x, PANEL->lda)] = src[x]; + } + } + } + + return 0; + } + + while (i<numrows) { + /* Skip rows flagged with a negative local index */ + if (rows[i] < 0) { i++; continue; } + + /* Count the run of consecutive valid row indices starting at i */ + int j = 1; + while ((i+j<numrows) && (rows[i+j]>=0)) { + j++; + } + + double *src = data + i*ld; + + unsigned long long completion_flags; + + hpl_accel_reform_rows_R_to_B( + /* IN (int) Number of rows to copy */ j, + /* IN (int) Number of values (doubles) per row to copy */ nn, + /* IN (double*) Buffer containing row-formatted data */ src, + /* IN (int) Leading dimension of the row buffer */ ld, + /* OUT (double*) block formatted matrix */ PANEL->pmat->A, + /* IN (int) Leading dimension of matrix [a] */ M_SUB*PANEL->lda, + /* IN (int*) Array of row indices */ &rows[i], + /* IN (int) Starting [a] block matrix column offset */ jj, + /* IN (unsigned long long *) Completion variable */ &completion_flags); + + wait_for(&completion_flags); + + i += j; + } + + return 0; +} Index: src/accel/HPL_accel_swap.c =================================================================== RCS file: src/accel/HPL_accel_swap.c diff -N src/accel/HPL_accel_swap.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/accel/HPL_accel_swap.c 20 Aug 2008 03:57:54 -0000 1.8 @@ -0,0 +1,301 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "hpl.h" +#include "HPL_accel_private.h" + +/* ---------------------------------------------------------------- */ +int HPL_accel_swap00N +( + HPL_T_panel *PANEL, /* Panel structure for current panel */ + const int *IPIV, /* Pivot vector */ + const int j1, /* Relative index of first column of matrix */ + const int nn /* local number of columns of each row to be swapped */ +) +/* + * Purpose + * ======= + * + * HPL_accel_swap00N swaps rows in a block-row formatted matrix.
+ * + * ----------------------------------------------------------------- + */ +{ + if (nn <=0) { return 0; } + + if ( panel_prep >= PANEL->jj ) { + HPL_dlaswp00N( PANEL->jb, nn, Mptr(PANEL->A, 0, j1, PANEL->lda), PANEL->lda, IPIV ); + return 0; + } + + TRACE_PRINT("%s [%d] entry ii=%d jj=%d j1=%d nn=%d\n", + __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, j1, nn); + + int row_indx = INDEX_BLK(PANEL->ii, 0, M_SUB*PANEL->lda); + + unsigned long long completion_flags = 0; + + hpl_accel_swap_rows_B_to_B( + /* IN (int) number of rows to swap */ PANEL->jb, + /* IN (int) number of columns to swap in each row */ nn, + /* INOUT (double*) block-formatted matrix */ PANEL->pmat->A+row_indx, + /* IN (int) leading dimension for matrix */ M_SUB*PANEL->lda, + /* IN (int) array of row indices */ (int *)IPIV, + /* IN (int) starting column */ PANEL->jj+j1, + /* IN (unsigned long long *) Completion variable */ &completion_flags); + + wait_for(&completion_flags); + + return 0; +} + +/* ---------------------------------------------------------------- */ +int HPL_accel_swap01T +( + HPL_T_panel *PANEL, /* Panel structure for current panel */ + const int *LINDXA, /* row indices of source rows in A */ + const int *LINDXAU, /* row indices of dest rows in A or U */ + const int numrows, /* number of rows to copy */ + const int nn /* local number of columns of each row to be copied */ +) +/* + * Purpose + * ======= + * + * HPL_accel_swap01T copies rows from A into A and into U. + * + * ----------------------------------------------------------------- + */ +{ + int i, y; + + if (numrows <= 0 || nn <=0) { return 0; } + + TRACE_PRINT("%s [%d] entry ii=%d jj=%d numrows=%d nn=%d\n", + __FUNCTION__, PANEL->grid->iam, PANEL->ii, PANEL->jj, numrows, nn); + + double *A = PANEL->pmat->A; + int lda = M_SUB*PANEL->lda; + double *U = PANEL->U; + int ldu = PANEL->ldu; + + int blk_row = PANEL->ii, blk_col = PANEL->jj; + + for (i=0; i= 0) { + /* Copy source row into U */ + for (y=0; ygrid->iam, PANEL->ii, PANEL->jj, numrows, nn); + + double *A = PANEL->pmat->A; + int lda = M_SUB*PANEL->lda; + + int blk_row = PANEL->ii, blk_col = PANEL->jj; + + for (i=0; igrid->iam, PANEL->ii, PANEL->jj, numrows, nn); + + double *A = PANEL->pmat->A; + int lda = M_SUB*PANEL->lda; + double *U = PANEL->U; + int ldu = PANEL->ldu; + + int blk_row = PANEL->ii, blk_col = PANEL->jj; + + for (i=0; igrid->iam, PANEL->ii, PANEL->jj, numrows, nn); + + double *A = PANEL->pmat->A; + int lda = M_SUB*PANEL->lda; + double *U = PANEL->U; + int ldu = PANEL->ldu; + + int blk_row = PANEL->ii, blk_col = PANEL->jj; + + for (i=0; igrid->iam, PANEL->ii, PANEL->jj, numrows, nn); + + double *A = PANEL->pmat->A; + int lda = M_SUB*PANEL->lda; + double *U = PANEL->U; + int ldu = PANEL->ldu; + + int blk_row = PANEL->ii, blk_col = PANEL->jj; + + for (i=0; i +#include +#include +#include +#include +#include +#include +#include +#include + +#define MINIMUM_BUFFER_SIZE (8 + sizeof(hpalloc_hdr_t)) + + +typedef struct _hpalloc_hdr { + struct _hpalloc_hdr *next; + size_t size; +} hpalloc_hdr_t; + +static size_t hpsize=0; /* size of huge pages in bytes */ +static int hpseq=0; /* sequence number for huge page allocations */ +static hpalloc_hdr_t *heap=NULL; /* list of free buffers in the memory heap */ + +/* get_huge_pagesize + * ----------------- + * Parse /proc/meminfo for the size of huge pages. 
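+ * + * A typical entry (the value varies by platform) looks like: + * + * Hugepagesize: 16384 kB + * + * for which this routine returns 16384*1024 bytes.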
+ */ +static size_t get_huge_pagesize() +{ + FILE *fp; + size_t size; + char *line = NULL; + size_t len = 0; + ssize_t chrs_read; + + if ((fp = fopen("/proc/meminfo", "r"))) { + while ((chrs_read = getline(&line, &len, fp)) != -1) { + if (sscanf(line, "Hugepagesize:%ld", &size) == 1) { + if (strstr(line, "kB")) { + size *= 1024; + } else if (strstr(line, "MB")) { + size *= 1024*1024; + } + break; + } + } + if (line) free(line); + fclose(fp); + } + return (size); +} + +/* allocate_from_heap + * ------------------ + * Allocate a buffer of the specified size from the huge page memory allocator. + * If there is insufficient memory, NULL is returned. + */ +static void *allocate_from_heap(size_t size) +{ + size_t *size_ptr; + hpalloc_hdr_t *ptr = heap; + hpalloc_hdr_t *prev = NULL; + hpalloc_hdr_t *ptr2; + + /* Scan the heap looking for a free buffer large enough + */ + while (ptr) { + if (ptr->size >= size + sizeof(size_t)) { + if (ptr->size >= (size + MINIMUM_BUFFER_SIZE)) { + /* Split the buffer in two, allocating off the front */ + ptr2 = (hpalloc_hdr_t *)((char *)ptr + size + sizeof(size_t)); + if (prev) prev->next = ptr2; + else heap = ptr2; + ptr2->next = ptr->next; + ptr2->size = ptr->size - (size + sizeof(size_t)); + } else { + /* Allocate the entire buffer block */ + if (prev) prev->next = ptr->next; + else heap = ptr->next; + size = ptr->size - sizeof(size_t); + } + size_ptr = (size_t *)ptr; + *size_ptr = size; + return ((void *)(size_ptr+1)); + } + prev = ptr; + ptr = ptr->next; + } + /* Failed allocating buffer */ + return NULL; +} + + +/* add_to_heap + * ----------- + * Add the memory buffer of 'size' bytes beginning at 'buffer' + * to the heap allocator. + */ +static void add_to_heap(void *buffer, size_t size) +{ + hpalloc_hdr_t *ptr = heap; + hpalloc_hdr_t *prev = NULL; + hpalloc_hdr_t *buf; + + buf = (hpalloc_hdr_t *)buffer; + buf->size = size; + + /* Scan the heap looking for the appropriate insertion point. + */ + while ((ptr) && (buffer > (void *)ptr)) { + prev = ptr; + ptr = ptr->next; + } + + /* Insert the buffer into heap's free list. Coalesce the + * adjacent blocks, before and after. + */ + buf->next = ptr; + if (ptr) { + if (((char *)buf + buf->size) == (char *)(ptr)) { + /* Combine buf and ptr */ + buf->next = ptr->next; + buf->size += ptr->size; + } + } + if (prev) { + if (((char *)prev + prev->size) == (char *)(buf)) { + /* Combine prev and buf */ + prev->next = buf->next; + prev->size += buf->size; + } else { + prev->next = buf; + } + } else { + heap = buf; + } +} + +#if 1 +/* Diagnostic routines + */ +void dump_heap() +{ + hpalloc_hdr_t *ptr = heap; + + while (ptr) { + printf("HEAP %p %lld\n", ptr, (long long int)ptr->size); + ptr = ptr->next; + } +} +#endif + +/* HPL_hpalloc + * ------- + * Allocate a buffer of 'size' bytes from the huge page memory heap.
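+ * + * Usage mirrors malloc/free (a sketch): + * + * double *w = (double *)HPL_hpalloc( n * sizeof(double) ); + * if (w != NULL) { ... use w ... ; HPL_hpfree( w ); }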
*/ +void* HPL_hpalloc(size_t size) +{ + void *ptr; + int fmem; + char memfile[100]; + char msg[100]; + size_t bufsize; + + if (size == 0) return NULL; + + /* Pad the size to the next double word + */ + size = (size + 7) & ~7; + + /* See if allocation is available on the free list + */ + if ((ptr = allocate_from_heap(size))) return ptr; + + /* Not available, expand the heap and try again + */ + if (hpsize == 0) { + hpsize = get_huge_pagesize(); + if (hpsize == 0) { + perror("Failed locating huge page size"); + return NULL; + } + } + + bufsize = hpsize * (((sizeof(size_t) + size) + hpsize-1) / hpsize); + + sprintf(memfile, "/huge/linpack_%d_%d", getpid(), hpseq++); + if ((fmem = open(memfile, O_CREAT | O_RDWR, 0755)) == -1) { + sprintf(msg, "Failed opening file %s", memfile); + perror(msg); + return NULL; + } + remove(memfile); + if ((ptr = (void *)mmap(0, bufsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fmem, 0)) == MAP_FAILED) { + perror("Failed mmapping hugetlbs file"); + return NULL; + } + add_to_heap(ptr, bufsize); + return (allocate_from_heap(size)); +} + + + +/* HPL_hpfree + * -------- + * Free a buffer previously allocated using HPL_hpalloc. + */ +void HPL_hpfree(void *ptr) +{ + if (ptr) { + ptr = (void *)(((size_t *)ptr)-1); + add_to_heap(ptr, *((size_t *)ptr) + sizeof(size_t)); + } +} + Index: src/blas/HPL_dgemm.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/blas/HPL_dgemm.c,v retrieving revision 1.1 retrieving revision 1.7 diff -u -r1.1 -r1.7 --- src/blas/HPL_dgemm.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/blas/HPL_dgemm.c 26 Aug 2008 13:24:26 -0000 1.7 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -406,6 +409,18 @@ * * --------------------------------------------------------------------- */ +#ifdef HPL_CALL_ACCEL + /* If this call can be performed on the accelerator, invoke + the accelerator DGEMM function. */ + if ( ((N & (4-1)) == 0) && ((K & (4-1)) == 0) + && ORDER == HplColumnMajor + && TRANSA == HplNoTrans && TRANSB == HplNoTrans + && ALPHA == -HPL_rone && BETA == HPL_rone ) + { + HPL_accel_dgemmCL(M, N, K, A, LDA, B, LDB, C, LDC); + return; + } +#endif #ifdef HPL_CALL_CBLAS cblas_dgemm( ORDER, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ); Index: src/comm/HPL_allgatherv.c =================================================================== RCS file: src/comm/HPL_allgatherv.c diff -N src/comm/HPL_allgatherv.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/comm/HPL_allgatherv.c 7 Aug 2008 13:07:08 -0000 1.2 @@ -0,0 +1,206 @@ +/* ------------------------------------------------------------------ */ +/* (C) Copyright 2007 */ +/* International Business Machines Corporation, */ +/* */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------ */ + +#include "hpl.h" + +/* + * Purpose + * ======= + * + * HPL_Allgatherv is an API-compatible wrapper for MPI_Allgatherv, to + * allow experimentation with various implementations of allgatherv. + * + * Arguments + * ========= + * + * sendbuf (input) double * + * Address of send buffer for this rank.
+ * + * sendcount (input) int + * Number of elements in send buffer + * + * datatype (input) MPI_Datatype + * Datatype of send buffer elements (assumed to be MPI_DOUBLE) + * + * recvbuf (input/output) double * + * Address of receive buffer + * + * recvcounts (input) int * + * Array specifying the number of elements to receive from each + * participant in the Allgatherv + * + * displs (input) int * + * Array specifying the displacement into recvbuf at which + * the data from each task should be placed. + * + * recvtype (input) MPI_Datatype + * Datatype of recv buffer elements (assumed to be MPI_DOUBLE) + * + * comm (input) MPI_Comm + * MPI Communicator on which the communication should flow. + * + * --------------------------------------------------------------------- + */ + +#define FANCY_ALLGATHER 1 + +int HPL_Allgatherv ( double *sendbuf, int sendcount, MPI_Datatype datatype, + double *recvbuf, int *recvcounts, int *displs, + MPI_Datatype recvtype, MPI_Comm comm ) +{ + int retval; +#ifdef FANCY_ALLGATHER + int how_many, i, j, leftover, my_rank, receiver, pointer, current_resource; + int total_size, average_size, hole, target_displ, recvidx, processor; + int *recvcounts2, *sendcounts2, *modified_displs; + int *modified_sendcount, *modified_recvcounts, *order_displs, *back_displs; + double *modified_sendbuf; + MPI_Status status_not_used; + int send_offset_table[100][100]; + int send_amount_table[100][100]; + + MPI_Comm_size(comm, &how_many); + MPI_Comm_rank(comm, &my_rank); + recvcounts2 = (int*) malloc(sizeof(int)*how_many); + sendcounts2 = (int*) malloc(sizeof(int)*how_many); + modified_sendcount = (int*) malloc(sizeof(int)*how_many); + modified_recvcounts = (int*) malloc(sizeof(int)*how_many); + modified_displs = (int*) malloc(sizeof(int)*how_many); + order_displs = (int*) malloc(sizeof(int)*how_many); + back_displs = (int*) malloc(sizeof(int)*how_many); + for(i = 0; i < how_many; i++) + { + back_displs[i] = displs[i]; + } + target_displ = 0; + for(i = 0; i < how_many; i++) + { + for(j = 0; j < how_many; j++) + { + if(displs[j] == target_displ) + { + order_displs[i] = j; + displs[j] = -1; + target_displ = target_displ+recvcounts[j]; + j = how_many; + } + } + } + for(i = 0; i < how_many; i++) + { + for(j = 0; j < how_many; j++) + { + send_amount_table[i][j] = 0; + send_offset_table[i][j] = 0; + } + } + total_size = 0; + for(i = 0; i < how_many; i++) + total_size += recvcounts[i]; + average_size = total_size/how_many; + leftover = total_size%how_many; // if the rank is < leftover, you get one extra thing to move + if(leftover > 0) + hole = average_size+1; + else hole = average_size; + processor = order_displs[0]; + if(leftover > processor) + hole=average_size+1; + else hole=average_size; + recvidx = 0; + receiver = order_displs[recvidx]; + pointer = 0; + // Missing ...
if sender == receiver, don't send + for(i = 0; i < how_many; i++) + { + current_resource = recvcounts[order_displs[i]]; + while(current_resource > 0) + { + if(current_resource >= hole) + { + current_resource = current_resource - hole; + send_amount_table[order_displs[i]][receiver] = hole; + send_offset_table[order_displs[i]][receiver] = pointer; + pointer = pointer+hole; + recvidx++; + receiver = order_displs[recvidx]; + if(receiver < leftover) + hole = average_size+1; + else if(receiver < how_many) + hole = average_size; + else {hole = 0; break;} + } + else // if(current_resource > 0)??& if(current_resource < hole) + { + hole = hole - current_resource; + send_amount_table[order_displs[i]][receiver] = current_resource; + send_offset_table[order_displs[i]][receiver] = pointer; + pointer = pointer+current_resource; + current_resource = 0; + } + } + } + + for(i = 0; i < how_many; i++) // source + { + for(j = 0; j < how_many; j++) // destination + { + if((i != j) && (send_amount_table[i][j] != 0)) + { + if((my_rank == i) ) + { + MPI_Send((void*)&recvbuf[send_offset_table[i][j]], send_amount_table[i][j], datatype, j, i, comm ); + } + if((my_rank == j) ) + { + MPI_Recv((void*)&recvbuf[send_offset_table[i][j]], send_amount_table[i][j], datatype, i, i, comm, &status_not_used ); + } + } + } + } + + pointer = 0; + for(i = 0; i < how_many; i++) + { + processor = order_displs[i]; + if(processor < leftover) + { + modified_recvcounts[processor] = average_size+1; + modified_displs[processor] = pointer; + pointer = pointer+average_size+1; + modified_sendcount[processor] = average_size+1; + } + else // if(processor >= leftover) + { + modified_recvcounts[processor] = average_size; + modified_displs[processor] = pointer; + pointer = pointer+average_size; + modified_sendcount[processor] = average_size; + } + } + + modified_sendbuf = &recvbuf[modified_displs[my_rank]]; // It's "in place" ... so my send_buf is where I would receive if I were to receive my own data + retval = MPI_Allgatherv(modified_sendbuf, modified_sendcount[my_rank], datatype, + recvbuf, modified_recvcounts, modified_displs, recvtype, comm); + for(i = 0; i < how_many; i++) + { + displs[i] = back_displs[i]; + } + free(recvcounts2); + free(sendcounts2); + free(modified_sendcount); + free(modified_recvcounts); + free(modified_displs); + free(order_displs); + free(back_displs); +#else + + retval = MPI_Allgatherv(sendbuf, sendcount, datatype, + recvbuf, recvcounts, displs, recvtype, comm); + +#endif + return retval; +} Index: src/panel/HPL_pdpanel_free.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/panel/HPL_pdpanel_free.c,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- src/panel/HPL_pdpanel_free.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/panel/HPL_pdpanel_free.c 26 Aug 2008 13:24:26 -0000 1.3 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ #include "hpl.h" #ifdef STDC_HEADERS @@ -94,7 +97,11 @@ vsip_blockdestroy_d( PANEL->Ublock ); #endif +#ifdef HPL_USE_HUGE_PAGES + if( PANEL->WORK ) HPL_hpfree( PANEL->WORK ); +#else if( PANEL->WORK ) free( PANEL->WORK ); +#endif if( PANEL->IWORK ) free( PANEL->IWORK ); return( MPI_SUCCESS ); Index: src/panel/HPL_pdpanel_init.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/panel/HPL_pdpanel_init.c,v retrieving revision 1.1 retrieving revision 1.9 diff -u -r1.1 -r1.9 --- src/panel/HPL_pdpanel_init.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/panel/HPL_pdpanel_init.c 26 Aug 2008 13:24:26 -0000 1.9 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -141,7 +144,7 @@ size_t dalign; int icurcol, icurrow, ii, itmp1, jj, lwork, ml2, mp, mycol, myrow, nb, npcol, nprow, - nq, nu; + nq, align, nu, uwork; /* .. * .. Executable Statements .. */ @@ -182,10 +185,11 @@ PANEL->pcol = icurcol; /* proc col owning 1st col of trailing A */ PANEL->msgid = TAG; /* message id to be used for panel bcast */ /* - * Initialize ldl2 and len to temporary dummy values and Update tag for - * next panel + * Initialize ldl2, ldu, and len to temporary dummy values and Update + * tag for next panel */ PANEL->ldl2 = 0; /* local leading dim of array L2 */ + PANEL->ldu = 0; /* local leading dim of array U */ PANEL->len = 0; /* length of the buffer to broadcast */ /* * Figure out the exact amount of workspace needed by the factorization @@ -201,15 +205,27 @@ * right after L2 (when it exist) so that one can receive a contiguous * buffer. */ + align = ALGO->align; dalign = ALGO->align * sizeof( double ); if( npcol == 1 ) /* P x 1 process grid */ { /* space for L1, DPIV, DINFO */ lwork = ALGO->align + ( PANEL->len = JB * JB + JB + 1 ); + uwork = 0; if( nprow > 1 ) /* space for U */ - { nu = nq - JB; lwork += JB * Mmax( 0, nu ); } + { + nu = Mmax( 0, nq - JB ); + /* To allow for alignment of either transposed or non-transposed + U panels, compute the size by padding both dimensions. */ + uwork = (((JB+align-1)/align)*align) * (((nu+align-1)/align)*align); + } + lwork += uwork; - if( !( PANEL->WORK = (void *)malloc( lwork * sizeof( double ) ) ) ) +#ifdef HPL_USE_HUGE_PAGES + if( !( PANEL->WORK = (void *)HPL_hpalloc( 2 * lwork * sizeof( double ) ) ) ) +#else + if( !( PANEL->WORK = (void *)malloc( 2 * lwork * sizeof( double ) ) ) ) +#endif { HPL_pabort( __LINE__, "HPL_pdpanel_init", "Memory allocation failed" ); @@ -220,27 +236,43 @@ */ PANEL->L2 = PANEL->A + ( myrow == icurrow ? JB : 0 ); PANEL->ldl2 = A->ld; - PANEL->L1 = (double *)HPL_PTR( PANEL->WORK, dalign ); + PANEL->L1 = (double *)FIX_4GB_BOUNDARY_CROSSING( + HPL_PTR( PANEL->WORK, dalign ), + JB * JB * sizeof(double) ); PANEL->DPIV = PANEL->L1 + JB * JB; PANEL->DINFO = PANEL->DPIV + JB; *(PANEL->DINFO) = 0.0; - PANEL->U = ( nprow > 1 ? PANEL->DINFO + 1: NULL ); + PANEL->U = ( nprow > 1 ? 
(double *)FIX_4GB_BOUNDARY_CROSSING( + HPL_PTR( PANEL->DINFO + 1, dalign ), + uwork * sizeof(double) ) + : NULL ); } else { /* space for L2, L1, DPIV */ ml2 = ( myrow == icurrow ? mp - JB : mp ); ml2 = Mmax( 0, ml2 ); PANEL->len = ml2*JB + ( itmp1 = JB*JB + JB + 1 ); + /* enforce alignment requirement on L2 panel */ + ml2 = ((ml2+align-1)/align)*align; #ifdef HPL_COPY_L - lwork = ALGO->align + PANEL->len; + PANEL->ldl2 = Mmax( 1, ml2 ); + lwork = ALGO->align + PANEL->ldl2*JB + itmp1; #else - lwork = ALGO->align + ( mycol == icurcol ? itmp1 : PANEL->len ); + PANEL->ldl2 = ( mycol == icurcol ) ? A->ld : Mmax( 1, ml2 ); + lwork = ALGO->align + ( mycol == icurcol ? 0 : PANEL->ldl2*JB ) + itmp1; #endif + uwork = 0; if( nprow > 1 ) /* space for U */ { - nu = ( mycol == icurcol ? nq - JB : nq ); - lwork += JB * Mmax( 0, nu ); + nu = Mmax( 0, ( mycol == icurcol ? nq - JB : nq) ); + /* To allow for alignment of either transposed or non-transposed + U panels, compute the size by padding both dimensions. */ + uwork = (((JB+align-1)/align)*align) * (((nu+align-1)/align)*align); } - - if( !( PANEL->WORK = (void *)malloc( lwork * sizeof( double ) ) ) ) + lwork += uwork; +#ifdef HPL_USE_HUGE_PAGES + if( !( PANEL->WORK = (void *)HPL_hpalloc( 2 * lwork * sizeof( double ) ) ) ) +#else + if( !( PANEL->WORK = (void *)malloc( 2 * lwork * sizeof( double ) ) ) ) +#endif { HPL_pabort( __LINE__, "HPL_pdpanel_init", "Memory allocation failed" ); @@ -250,26 +282,36 @@ * rent process column when HPL_COPY_L is not defined. */ #ifdef HPL_COPY_L - PANEL->L2 = (double *)HPL_PTR( PANEL->WORK, dalign ); - PANEL->ldl2 = Mmax( 1, ml2 ); - PANEL->L1 = PANEL->L2 + ml2 * JB; + PANEL->L2 = (double *)FIX_4GB_BOUNDARY_CROSSING( + HPL_PTR( PANEL->WORK, dalign ), + PANEL->ldl2 * JB * sizeof(double) ); + PANEL->L1 = (double *)FIX_4GB_BOUNDARY_CROSSING( + PANEL->L2 + PANEL->ldl2 * JB, + JB * JB * sizeof(double) ); #else if( mycol == icurcol ) { PANEL->L2 = PANEL->A + ( myrow == icurrow ? JB : 0 ); - PANEL->ldl2 = A->ld; - PANEL->L1 = (double *)HPL_PTR( PANEL->WORK, dalign ); + PANEL->L1 = (double *)FIX_4GB_BOUNDARY_CROSSING( + HPL_PTR( PANEL->WORK, dalign ), + JB * JB * sizeof(double) ); } else { - PANEL->L2 = (double *)HPL_PTR( PANEL->WORK, dalign ); - PANEL->ldl2 = Mmax( 1, ml2 ); - PANEL->L1 = PANEL->L2 + ml2 * JB; + PANEL->L2 = (double *)FIX_4GB_BOUNDARY_CROSSING( + HPL_PTR( PANEL->WORK, dalign ), + PANEL->ldl2 * JB * sizeof(double) ); + PANEL->L1 = (double *)FIX_4GB_BOUNDARY_CROSSING( + PANEL->L2 + PANEL->ldl2 * JB, + JB * JB * sizeof(double) ); } #endif PANEL->DPIV = PANEL->L1 + JB * JB; PANEL->DINFO = PANEL->DPIV + JB; *(PANEL->DINFO) = 0.0; - PANEL->U = ( nprow > 1 ? PANEL->DINFO + 1 : NULL ); + PANEL->U = ( nprow > 1 ? (double *)FIX_4GB_BOUNDARY_CROSSING( + HPL_PTR( PANEL->DINFO + 1, dalign ), + uwork * sizeof(double) ) + : NULL ); } #ifdef HPL_CALL_VSIPL PANEL->Ablock = A->block; Index: src/pfact/HPL_pdfact.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pfact/HPL_pdfact.c,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- src/pfact/HPL_pdfact.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pfact/HPL_pdfact.c 26 Aug 2008 13:24:26 -0000 1.3 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
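The definition of FIX_4GB_BOUNDARY_CROSSING is not part of this diff; from its call sites above it takes an aligned pointer plus the byte size of the buffer that will live there and returns a possibly adjusted pointer. A plausible reading, given the Cell/B.E. target, is that it keeps a buffer from straddling a 4 GB address boundary, which matters for 32-bit DMA offset arithmetic on the SPEs; the 2 * lwork over-allocation above would then supply the slack such an adjustment needs. A purely illustrative sketch under that assumption (hypothetical name and behavior, not the patch's macro; assumes 64-bit pointers):

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative only: if [p, p+size) crosses a 4 GB boundary, round p
       up to that boundary so the buffer lies entirely on one side. */
    static void *fix_4gb_boundary_crossing(void *ptr, size_t size)
    {
        uintptr_t p    = (uintptr_t)ptr;
        uintptr_t mask = ~(uintptr_t)0xFFFFFFFFULL;   /* 4 GB segment bits */
        if ((p & mask) != ((p + size - 1) & mask))
            p = (p + 0xFFFFFFFFULL) & mask;           /* next 4 GB line   */
        return (void *)p;
    }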
* --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -114,6 +117,10 @@ jb = PANEL->jb; PANEL->n -= jb; PANEL->ja += jb; if( ( PANEL->grid->mycol != PANEL->pcol ) || ( jb <= 0 ) ) return; +#ifdef HPL_CALL_ACCEL + /* Copy panel data from accel to host */ + HPL_accel_pangetL(PANEL); +#endif #ifdef HPL_DETAILED_TIMING HPL_ptimer( HPL_TIMING_RPFACT ); #endif Index: src/pgesv/HPL_pdgesv.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdgesv.c,v retrieving revision 1.1 retrieving revision 1.5 diff -u -r1.1 -r1.5 --- src/pgesv/HPL_pdgesv.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_pdgesv.c 26 Aug 2008 13:24:26 -0000 1.5 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -98,6 +101,10 @@ A->info = 0; +#ifdef HPL_CALL_ACCEL + (void) HPL_accel_pgesv_init(GRID, ALGO, A); +#endif + if( ( ALGO->depth == 0 ) || ( GRID->npcol == 1 ) ) { HPL_pdgesv0( GRID, ALGO, A ); @@ -106,6 +113,11 @@ { HPL_pdgesvK2( GRID, ALGO, A ); } + +#ifdef HPL_CALL_ACCEL + (void) HPL_accel_pgesv_fini(GRID, ALGO, A); +#endif + /* * Solve upper triangular system */ Index: src/pgesv/HPL_pdgesvK2.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdgesvK2.c,v retrieving revision 1.1 retrieving revision 1.5 diff -u -r1.1 -r1.5 --- src/pgesv/HPL_pdgesvK2.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_pdgesvK2.c 26 Aug 2008 13:24:26 -0000 1.5 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -176,10 +179,10 @@ nn = HPL_numrocI( jb, j, nb, nb, mycol, 0, npcol ); for( k = 0; k < depth; k++ ) /* partial updates 0..depth-1 */ (void) HPL_pdupdate( NULL, NULL, panel[k], nn ); - HPL_pdfact( panel[depth] ); /* factor current panel */ } else { nn = 0; } - /* Finish the latest update and broadcast the current panel */ + HPL_pdfact( panel[depth] ); /* factor current panel */ + /* Finish the latest update and broadcast the current panel */ (void) HPL_binit( panel[depth] ); HPL_pdupdate( panel[depth], &test, panel[0], nq-nn ); (void) HPL_bwait( panel[depth] ); Index: src/pgesv/HPL_pdlaswp00N.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdlaswp00N.c,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- src/pgesv/HPL_pdlaswp00N.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_pdlaswp00N.c 26 Aug 2008 13:24:26 -0000 1.3 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
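A recurring idiom in the hunks above and below is rounding a dimension up to the alignment unit, ((x+align-1)/align)*align; HPL_pdpanel_init applies it to both dimensions of the U workspace, and the swap routines that follow apply it to the leading dimension ldu. A tiny self-contained check of the arithmetic (illustrative values; HPL_pdinfo later in this patch pins ALIGN to 64 for accelerated builds):

    #include <stdio.h>

    /* Round x up to the next multiple of align, the same arithmetic as
       the uwork and ldu padding in this patch. */
    static int round_up(int x, int align)
    {
        return ((x + align - 1) / align) * align;
    }

    int main(void)
    {
        int align = 64;
        printf("%d\n", round_up(128, align));   /* 128: already aligned */
        printf("%d\n", round_up(130, align));   /* 192: padded up       */
        printf("%d\n", round_up(0,   align));   /* 0: empty stays empty */
        return 0;
    }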
* --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -122,9 +125,8 @@ mydist, mydis_; int Cmsgid=MSGID_BEGIN_PFACT, Np2, align, hdim, i, icurrow, *iflag, ipA, ipW, *ipl, - iprow, jb, k, lda, ldW, myrow, n, nprow, - partner, root, size_, usize; -#define LDU jb + iprow, jb, k, lda, ldu, ldW, myrow, n, + nprow, partner, root, size_, usize; /* .. * .. Executable Statements .. */ @@ -144,8 +146,14 @@ comm = grid->col_comm; ip2 = (unsigned int)grid->row_ip2; hdim = grid->row_hdim; align = PANEL->algo->align; A = PANEL->A; U = PANEL->U; iflag = PANEL->IWORK; - lda = PANEL->lda; icurrow = PANEL->prow; usize = jb * n; - ldW = n + 1; + lda = PANEL->lda; icurrow = PANEL->prow; ldW = n + 1; + +/* + * pad leading dimension of U panel to get proper alignment + */ + ldu = ((jb+align-1)/align)*align; + PANEL->ldu = ldu; + usize = ldu * n; /* * Allocate space for temporary W (ldW * jb) */ @@ -189,7 +197,7 @@ */ if( myrow == icurrow ) { - HPL_dlaswp01N( ipA, n, A, lda, U, LDU, lindxA, lindxAU ); + HPL_dlaswp01N( ipA, n, A, lda, U, ldu, lindxA, lindxAU ); } else { @@ -251,7 +259,7 @@ (void) HPL_sdrv( U, usize, Cmsgid, W, llen[partner] * ldW, Cmsgid, partner, comm ); if( llen[partner] > 0 ) - HPL_dlaswp03N( llen[partner], n, U, LDU, W, W+1, ldW ); + HPL_dlaswp03N( llen[partner], n, U, ldu, W, W+1, ldW ); } else if( mydist == ip2 ) { /* I recv U for later Bcast, I send my W */ @@ -316,7 +324,7 @@ (void) HPL_sdrv( U, usize, Cmsgid, Mptr( W, 0, ipW, ldW ), llen[partner]*ldW, Cmsgid, partner, comm ); - HPL_dlaswp03N( llen[partner], n, U, LDU, Mptr( W, 0, ipW, + HPL_dlaswp03N( llen[partner], n, U, ldu, Mptr( W, 0, ipW, ldW ), Mptr( W, 1, ipW, ldW ), ldW ); ipW += llen[partner]; } @@ -324,7 +332,7 @@ { (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize, Cmsgid, partner, comm ); - HPL_dlaswp04N( ipA, llen[myrow], n, U, LDU, A, lda, W, + HPL_dlaswp04N( ipA, llen[myrow], n, U, ldu, A, lda, W, W+1, ldW, lindxA, lindxAU ); } } @@ -401,7 +409,7 @@ * Every process in [ip2..nprow) (relatively to icurrow) grabs its piece * of A. */ - HPL_dlaswp05N( ipA, n, A, lda, U, LDU, lindxA, lindxAU ); + HPL_dlaswp05N( ipA, n, A, lda, U, ldu, lindxA, lindxAU ); } /* * If nprow is not a power of 2, proc[i-ip2] sends global result to Index: src/pgesv/HPL_pdlaswp00T.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdlaswp00T.c,v retrieving revision 1.1 retrieving revision 1.4 diff -u -r1.1 -r1.4 --- src/pgesv/HPL_pdlaswp00T.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_pdlaswp00T.c 26 Aug 2008 13:24:26 -0000 1.4 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -122,9 +125,8 @@ mydist, mydis_; int Cmsgid=MSGID_BEGIN_PFACT, Np2, align, hdim, i, icurrow, *iflag, ipA, ipW, *ipl, - iprow, jb, k, lda, ldW, myrow, n, nprow, - partner, root, size_, usize; -#define LDU n + iprow, jb, k, lda, ldu, ldW, myrow, n, + nprow, partner, root, size_, usize; /* .. * .. Executable Statements .. 
*/ @@ -144,8 +146,13 @@ comm = grid->col_comm; ip2 = (unsigned int)grid->row_ip2; hdim = grid->row_hdim; align = PANEL->algo->align; A = PANEL->A; U = PANEL->U; iflag = PANEL->IWORK; - lda = PANEL->lda; icurrow = PANEL->prow; usize = jb * n; - ldW = n + 1; + lda = PANEL->lda; icurrow = PANEL->prow; ldW = n + 1; +/* + * pad leading dimension of U panel to get proper alignment + */ + ldu = ((n+align-1)/align)*align; + PANEL->ldu = ldu; + usize = ldu * jb; /* * Allocate space for temporary W (ldW * jb) */ @@ -189,10 +196,20 @@ */ if( myrow == icurrow ) { - HPL_dlaswp01T( ipA, n, A, lda, U, LDU, lindxA, lindxAU ); +#ifdef HPL_CALL_ACCEL + if ( (PANEL->ja % (M_SUB*2)) == 0 ) + HPL_accel_swap01T( PANEL, lindxA, lindxAU, ipA, n ); + else +#endif + HPL_dlaswp01T( ipA, n, A, lda, U, ldu, lindxA, lindxAU ); } else { +#ifdef HPL_CALL_ACCEL + if ( (PANEL->ja % (M_SUB*2)) == 0 ) + HPL_accel_swap02N( PANEL, lindxA, lindxAU, ipA, W, W+1, ldW, n ); + else +#endif HPL_dlaswp02N( ipA, n, A, lda, W, W+1, ldW, lindxA, lindxAU ); } /* @@ -251,7 +268,7 @@ (void) HPL_sdrv( U, usize, Cmsgid, W, llen[partner] * ldW, Cmsgid, partner, comm ); if( llen[partner] > 0 ) - HPL_dlaswp03T( llen[partner], n, U, LDU, W, W+1, ldW ); + HPL_dlaswp03T( llen[partner], n, U, ldu, W, W+1, ldW ); } else if( mydist == ip2 ) { /* I recv U for later Bcast, I send my W */ @@ -316,7 +333,7 @@ (void) HPL_sdrv( U, usize, Cmsgid, Mptr( W, 0, ipW, ldW ), llen[partner]*ldW, Cmsgid, partner, comm ); - HPL_dlaswp03T( llen[partner], n, U, LDU, Mptr( W, 0, ipW, + HPL_dlaswp03T( llen[partner], n, U, ldu, Mptr( W, 0, ipW, ldW ), Mptr( W, 1, ipW, ldW ), ldW ); ipW += llen[partner]; } @@ -324,7 +341,12 @@ { (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize, Cmsgid, partner, comm ); - HPL_dlaswp04T( ipA, llen[myrow], n, U, LDU, A, lda, W, +#ifdef HPL_CALL_ACCEL + if ( (PANEL->ja % (M_SUB*2)) == 0 ) + HPL_accel_swap04T( PANEL, lindxA, lindxAU, ipA, llen[myrow], W, W+1, ldW, n ); + else +#endif + HPL_dlaswp04T( ipA, llen[myrow], n, U, ldu, A, lda, W, W+1, ldW, lindxA, lindxAU ); } } @@ -401,7 +423,12 @@ * Every process in [ip2..nprow) (relatively to icurrow) grabs its piece * of A. */ - HPL_dlaswp05T( ipA, n, A, lda, U, LDU, lindxA, lindxAU ); +#ifdef HPL_CALL_ACCEL + if ( (PANEL->ja % (M_SUB*2)) == 0 ) + HPL_accel_swap05T( PANEL, lindxA, lindxAU, ipA, n ); + else +#endif + HPL_dlaswp05T( ipA, n, A, lda, U, ldu, lindxA, lindxAU ); } /* * If nprow is not a power of 2, proc[i-ip2] sends global result to Index: src/pgesv/HPL_pdlaswp01N.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdlaswp01N.c,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- src/pgesv/HPL_pdlaswp01N.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_pdlaswp01N.c 26 Aug 2008 13:24:26 -0000 1.3 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -120,8 +123,7 @@ * permU; static int equil=-1; int icurrow, * iflag, * ipA, * ipl, jb, k, - lda, myrow, n, nprow; -#define LDU jb + align, lda, ldu, myrow, n, nprow; /* .. * .. Executable Statements .. 
*/ @@ -142,7 +144,12 @@ */ nprow = PANEL->grid->nprow; myrow = PANEL->grid->myrow; A = PANEL->A; U = PANEL->U; iflag = PANEL->IWORK; - lda = PANEL->lda; icurrow = PANEL->prow; + lda = PANEL->lda; icurrow = PANEL->prow; align = PANEL->algo->align; +/* + * pad leading dimension of U panel to get proper alignment + */ + ldu = ((jb+align-1)/align)*align; + PANEL->ldu = ldu; /* * Compute ipID (if not already done for this panel). lindxA and lindxAU * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1 @@ -178,11 +185,11 @@ * Copy into U the rows to be spread (local to icurrow) */ if( myrow == icurrow ) - { HPL_dlaswp01N( *ipA, n, A, lda, U, LDU, lindxA, lindxAU ); } + { HPL_dlaswp01N( *ipA, n, A, lda, U, ldu, lindxA, lindxAU ); } /* * Spread U - optionally probe for column panel */ - HPL_spreadN( PBCST, IFLAG, PANEL, HplRight, n, U, LDU, 0, iplen, + HPL_spreadN( PBCST, IFLAG, PANEL, HplRight, n, U, ldu, 0, iplen, ipmap, ipmapm1 ); /* * Local exchange (everywhere but in process row icurrow) @@ -191,22 +198,22 @@ { k = ipmapm1[myrow]; HPL_dlaswp06N( iplen[k+1]-iplen[k], n, A, lda, Mptr( U, iplen[k], - 0, LDU ), LDU, lindxA ); + 0, ldu ), ldu, lindxA ); } /* * Equilibration */ if( equil != 0 ) - HPL_equil( PBCST, IFLAG, PANEL, HplNoTrans, n, U, LDU, iplen, + HPL_equil( PBCST, IFLAG, PANEL, HplNoTrans, n, U, ldu, iplen, ipmap, ipmapm1, iwork ); /* * Rolling phase */ - HPL_rollN( PBCST, IFLAG, PANEL, n, U, LDU, iplen, ipmap, ipmapm1 ); + HPL_rollN( PBCST, IFLAG, PANEL, n, U, ldu, iplen, ipmap, ipmapm1 ); /* * Permute U in every process row */ - HPL_dlaswp00N( jb, n, U, LDU, permU ); + HPL_dlaswp00N( jb, n, U, ldu, permU ); #ifdef HPL_DETAILED_TIMING HPL_ptimer( HPL_TIMING_LASWP ); Index: src/pgesv/HPL_pdlaswp01T.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdlaswp01T.c,v retrieving revision 1.1 retrieving revision 1.4 diff -u -r1.1 -r1.4 --- src/pgesv/HPL_pdlaswp01T.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_pdlaswp01T.c 26 Aug 2008 13:24:26 -0000 1.4 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -120,8 +123,7 @@ * permU; static int equil=-1; int icurrow, * iflag, * ipA, * ipl, jb, k, - lda, myrow, n, nprow; -#define LDU n + align, lda, ldu, myrow, n, nprow; /* .. * .. Executable Statements .. */ @@ -142,7 +144,12 @@ */ nprow = PANEL->grid->nprow; myrow = PANEL->grid->myrow; A = PANEL->A; U = PANEL->U; iflag = PANEL->IWORK; - lda = PANEL->lda; icurrow = PANEL->prow; + lda = PANEL->lda; icurrow = PANEL->prow; align = PANEL->algo->align; +/* + * pad leading dimension of U panel to get proper alignment + */ + ldu = ((n+align-1)/align)*align; + PANEL->ldu = ldu; /* * Compute ipID (if not already done for this panel). 
lindxA and lindxAU * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1 @@ -178,11 +185,18 @@ * Copy into U the rows to be spread (local to icurrow) */ if( myrow == icurrow ) - { HPL_dlaswp01T( *ipA, n, A, lda, U, LDU, lindxA, lindxAU ); } + { +#ifdef HPL_CALL_ACCEL + if ( (PANEL->ja % (M_SUB*2)) == 0 ) + HPL_accel_swap01T( PANEL, lindxA, lindxAU, *ipA, n ); + else +#endif + HPL_dlaswp01T( *ipA, n, A, lda, U, ldu, lindxA, lindxAU ); + } /* * Spread U - optionally probe for column panel */ - HPL_spreadT( PBCST, IFLAG, PANEL, HplRight, n, U, LDU, 0, iplen, + HPL_spreadT( PBCST, IFLAG, PANEL, HplRight, n, U, ldu, 0, iplen, ipmap, ipmapm1 ); /* * Local exchange (everywhere but in process row icurrow) @@ -190,23 +204,28 @@ if( myrow != icurrow ) { k = ipmapm1[myrow]; +#ifdef HPL_CALL_ACCEL + if ( (PANEL->ja % (M_SUB*2)) == 0 ) + HPL_accel_swap06T( PANEL, lindxA, iplen[k+1]-iplen[k], iplen[k], n ); + else +#endif HPL_dlaswp06T( iplen[k+1]-iplen[k], n, A, lda, Mptr( U, 0, - iplen[k], LDU ), LDU, lindxA ); + iplen[k], ldu ), ldu, lindxA ); } /* * Equilibration */ if( equil != 0 ) - HPL_equil( PBCST, IFLAG, PANEL, HplTrans, n, U, LDU, iplen, ipmap, + HPL_equil( PBCST, IFLAG, PANEL, HplTrans, n, U, ldu, iplen, ipmap, ipmapm1, iwork ); /* * Rolling phase */ - HPL_rollT( PBCST, IFLAG, PANEL, n, U, LDU, iplen, ipmap, ipmapm1 ); + HPL_rollT( PBCST, IFLAG, PANEL, n, U, ldu, iplen, ipmap, ipmapm1 ); /* * Permute U in every process row */ - HPL_dlaswp10N( n, jb, U, LDU, permU ); + HPL_dlaswp10N( n, jb, U, ldu, permU ); #ifdef HPL_DETAILED_TIMING HPL_ptimer( HPL_TIMING_LASWP ); Index: src/pgesv/HPL_pdlaswp03T.c =================================================================== RCS file: src/pgesv/HPL_pdlaswp03T.c diff -N src/pgesv/HPL_pdlaswp03T.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/pgesv/HPL_pdlaswp03T.c 20 Aug 2008 18:23:35 -0000 1.8 @@ -0,0 +1,477 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "hpl.h" + +/* + * Purpose + * ======= + * + * HPL_pdlaswp03T is an API-compatible replacement for the + * HPL_pdlaswp0xT functions which perform the NB row interchanges to + * NN columns of the trailing submatrix. The swap is performed using + * the MPI_Allgatherv and MPI_Scatterv collective communications APIs. + * + * Arguments + * ========= + * + * PBCST (local input/output) HPL_T_panel * + * On entry, PBCST points to the data structure containing the + * panel (to be broadcast) information, or NULL. + * + * IFLAG (local output) int * + * On exit, IFLAG indicates whether or not the broadcast has + * been completed when PBCST is not NULL on entry. When PBCST is + * NULL on entry, IFLAG is left unchanged. + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel (to be updated) information. + * + * NN (local input) const int + * On entry, NN specifies the local number of columns of the + * trailing submatrix to be updated starting at the current + * position. NN must be at least zero. + * + * Note: PBCST is the panel that has just been factored and must be + * broadcast. PANEL is generally some block column in the matrix to the + * left of PBCST. The operations of broadcasting PBCST and updating PANEL + * are combined to allow the implementation to attempt to overlap them.
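Before reading the full implementation below, the traffic pattern is easier to see in miniature: every process row contributes its locally held winner rows, MPI_Allgatherv assembles the complete block of winners on all rows, and MPI_Scatterv pushes the displaced loser rows back down with the same counts and displacements. A toy model of that pattern, one double per "row" and arbitrary counts (illustrative sizes, not HPL code):

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        int rank, size, j, total = 0;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        int *counts = malloc(size * sizeof(int));
        int *displs = malloc(size * sizeof(int));
        for (j = 0; j < size; j++) {      /* rank j contributes j+1 values */
            counts[j] = j + 1;
            displs[j] = total;
            total    += counts[j];
        }

        double *mine = malloc(counts[rank] * sizeof(double));
        double *all  = malloc(total * sizeof(double));
        for (j = 0; j < counts[rank]; j++) mine[j] = 100.0*rank + j;

        /* Winners travel up: every rank ends with the assembled block. */
        MPI_Allgatherv(mine, counts[rank], MPI_DOUBLE,
                       all, counts, displs, MPI_DOUBLE, MPI_COMM_WORLD);

        /* Losers travel down: the root carves the same block back out. */
        MPI_Scatterv(all, counts, displs, MPI_DOUBLE,
                     mine, counts[rank], MPI_DOUBLE, 0, MPI_COMM_WORLD);

        if (rank == 0) printf("assembled %d values\n", total);
        free(mine); free(all); free(counts); free(displs);
        MPI_Finalize();
        return 0;
    }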
+ * + * ----------------------------------------------------------------- + */ + +void HPL_pdlaswp03T +( + HPL_T_panel * PBCST, + int * IFLAG, + HPL_T_panel * PANEL, + const int NN +) +{ + int my_prow = PANEL->grid->myrow; + int nprow = PANEL->grid->nprow; + int align = PANEL->algo->align; + int jb = PANEL->jb; + void *vptr = NULL; + double *B /* Buffer for MPI Collectives */; + int ldb; /* Leading dimension for B */ + int j; + + /* There is nothing to update, just return */ + if ( (NN <= 0) || (PANEL->jb <= 0) ) { return; } + + /* For simplicity, let's just do the bcast up front and get it out of + the way. */ + + /* TODO: MDK - Revisit whether this is what we want to do */ + if ( PBCST != NULL && *IFLAG == HPL_KEEP_TESTING ) { + do { (void) HPL_bcast( PBCST, IFLAG ); } + while( *IFLAG != HPL_SUCCESS ); + } + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer( HPL_TIMING_LASWP ); +#endif + + /* The accelerator uses a leading dimension of N for the B/U buffers. + For the non-accelerated case, we must use NN+1 because HPL's local + swap routines -- specifically HPL_dlaswp03T -- are designed to work + with this size. The extra double is used to hold the 'W0' array + passed in to HPL_dlaswp03T. */ + + /* Do the pivot on NN columns using MPI Collective Communications routines. + We need to do an Allgatherv to collect the winners on all processors + in the column and a Scatterv to move the losers down into the matrix, + and we have to pick which to do first. We'll do the Allgatherv + first, since the winners are needed for the DTRSM, which could + theoretically be done in parallel with the Scatterv of the losers, + which aren't needed until we start the DGEMM. */ + + /* The pivot row information is contained in PANEL->DPIV, an array of jb + ints, where DPIV[j] specifies the global row index of the row to be + swapped with row j of the panel. */ + + double *my_row_buffer; + + /* Note: The Allgatherv processing is essentially the same on all + processor rows. */ + + /* PANEL->IWORK has size (4 + 9*JB + 3*NPROW + MAX( 2*JB, NPROW+1 )) * sizeof(int) + and is intended to hold information used by pivot processing */ + + int *iflag; /* iflag indicates whether pivot info has already been + computed by a previous call to this function. + 0/1 => No , 2 => Yes */ + int *ipl; /* Length of ipID */ + int *ipID; /* Pivot pairs array computed by HPL_pipid. This array + is at most 4*N (N = PANEL->jb) elements in size. */ + int *winner_prow; /* Array of winner processor row numbers */ + int *my_winners; /* Array containing the local index of winner rows + on this row of processors */ + int *my_losers; /* Array containing the local index of rows that will + receive loser rows */ + int *WtoB; /* WtoB[j] is the row index of winner j in the + Allgatherv buffer. + This is computed in two parts -- first, the + offset *within* the block of rows contributed by + the winner's prow is computed. Then to this is added + the offset of the block of rows for the prow */ + int *BtoW; /* BtoW[j] is the row index of the Allgatherv buffer that + will hold row j of the block row of winners. + This is computed as the inverse of WtoB. */ + int *LtoB; /* LtoB[j] is the row index of loser j in the + Scatter buffer */ + int *BtoL; /* BtoL[j] is the row index of the Scatterv buffer that + will hold row j of the block row of losers. + This is computed as the inverse of LtoB.
*/ + int *prow_cnts; /* Array containing the count of winner rows for each + processor row -- will contribute to U */ + int *loser_cnts; /* Array containing the count of loser rows for each + processor row */ + int *my_loser_cnt; /* Count of rows on this processor row that will receive + loser rows */ + int *prowindx; /* Array containing the index [0..jb) of the first + row that each processor row contributes to U. This + is just the prefix sum of prow_cnts. */ + int *recvcounts; /* Array of receive counts for MPI_Allgatherv */ + int *displs; /* Array of displacements for MPI_Allgatherv */ + + int *iwork_free_area = PANEL->IWORK; + + /* Allocate pivot info structures in PANEL->IWORK */ + iflag = iwork_free_area; iwork_free_area++; + ipl = iwork_free_area; iwork_free_area++; + ipID = iwork_free_area; iwork_free_area += 4*jb; + winner_prow = iwork_free_area; iwork_free_area += jb; + my_winners = iwork_free_area; iwork_free_area += jb; + my_losers = iwork_free_area; iwork_free_area += jb; + WtoB = iwork_free_area; iwork_free_area += jb; + LtoB = iwork_free_area; iwork_free_area += jb; + my_loser_cnt = iwork_free_area; iwork_free_area++; + prow_cnts = iwork_free_area; iwork_free_area += nprow; + prowindx = iwork_free_area; iwork_free_area += nprow; + recvcounts = iwork_free_area; iwork_free_area += nprow; + displs = iwork_free_area; iwork_free_area += nprow; + + /* Due to space constraints, we use some areas of the IWORK buffer for multiple + purposes. */ + + BtoW = ipID; /* BtoW shares the first jb entries of ipID */ + BtoL = ipID+jb; /* BtoL shares the second jb entries of ipID */ + + loser_cnts = recvcounts; /* loser_cnts shares storage with recvcounts */ + + /* Pad leading dimension of U panel to get proper alignment */ + PANEL->ldu = ((NN+align-1)/align)*align; + +#ifdef HPL_CALL_ACCEL + ldb = NN; /* Leading dimension for B */ +#else + ldb = NN+1; /* Leading dimension for B */ +#endif + + /* Allocate another row buffer basically the same size as U -- jb x NN. */ + +#ifdef HPL_USE_HUGE_PAGES + vptr = HPL_hpalloc( (align + jb*ldb) * sizeof(double) ); +#else + vptr = malloc( (align + jb*ldb) * sizeof(double) ); +#endif + + if (vptr == NULL) { + HPL_pabort( __LINE__, "HPL_pdlaswp03T", "Memory allocation failed." ); + } + + B = (double *)HPL_PTR(vptr, ((size_t)(align) * sizeof(double))); + + if (*iflag != 2) { /* pivot data not already computed */ + /* Initialize pivot_info in PANEL->IWORK */ + *iflag = 2; + + HPL_pipid(PANEL, ipl, ipID); + + for (j=0; j<nprow; j++) { prow_cnts[j] = 0; } + + for (j=0; j<jb; j++) { + int local_index; + /* ipID[2*j] is the source (winner) of a pivot pair. Determine the processor + row that holds this winner (winner_prow[j]). Also find local_index of + this row on that processor row. */ + Mindxg2lp( local_index, winner_prow[j], ipID[2*j], PANEL->nb, PANEL->nb, 0, nprow ); + if ( winner_prow[j] == my_prow ) { + my_winners[prow_cnts[my_prow]] = local_index; + } + WtoB[j] = prow_cnts[winner_prow[j]]; + prow_cnts[winner_prow[j]] ++; + } + + prowindx[0] = 0; + for (j=1; j<nprow; j++) { prowindx[j] = prowindx[j-1] + prow_cnts[j-1]; } + + for (j=0; j<nprow; j++) { loser_cnts[j] = 0; } + for (j=0; j<jb; j++) { LtoB[j] = -1; } + + for (j=jb; j<*ipl/2; j++) { + int loser_index = ipID[2*j] - PANEL->ia; + int local_index, loser_prow; + /* ipID[2*j+1] is the destination of a loser row. Determine the processor + row that holds this loser (loser_prow). Also find local_index of this + row on that processor row. */ + Mindxg2lp( local_index, loser_prow, ipID[2*j+1], PANEL->nb, PANEL->nb, 0, nprow ); + if ( loser_prow == my_prow ) { + my_losers[loser_cnts[my_prow]] = local_index; + } + LtoB[loser_index] = prowindx[loser_prow] + loser_cnts[loser_prow]; + loser_cnts[loser_prow]++; + } + + *my_loser_cnt = loser_cnts[my_prow]; + + /* At this point, we're done with ipID, so we can overwrite it */ + + for (j=0; j<jb; j++) { + WtoB[j] += prowindx[winner_prow[j]]; + BtoW[WtoB[j]] = j; + } + + for (j=0; j<jb; j++) { BtoL[j] = -1; } + for (j=0; j<jb; j++) { + if (LtoB[j] >= 0) + BtoL[LtoB[j]] = j + PANEL->ii; + } + } + + /* Collect the winners in B. Once we have all the winners, we'll move them + into the right positions of U. */ + + /* Step 1. 
Copy the winner rows from the matrix storage into + the appropriate position in B for the Allgatherv. */ + + my_row_buffer = &(B[prowindx[my_prow]*ldb]); + +#ifdef HPL_CALL_ACCEL + (void) HPL_accel_rowget (PANEL, my_row_buffer, ldb, + prow_cnts[my_prow], my_winners, PANEL->jj, NN); +#else + for (j=0; j<prow_cnts[my_prow]; j+=8) + { + int src[8], dest[8]; + int k; + int row_cnt = prow_cnts[my_prow] - j; + if (row_cnt>8) { row_cnt = 8; } + for (k=0; k<row_cnt; k++) + { + dest[k] = k; + src[k] = my_winners[j+k] - PANEL->ii; + } + HPL_dlaswp01T( + /* Number of rows of A to copy */ row_cnt, + /* Number of cols of A to copy */ NN, + /* Source of data to copy */ PANEL->A, + /* leading dimension of A */ PANEL->lda, + /* Target of data copy */ my_row_buffer, + /* Leading dimension of U (B) (row major) */ ldb, + /* Local row indexes of A to be copied */ src, + /* Local row indexes of U (B) to receive the data */ dest ); + my_row_buffer += row_cnt*ldb; + } +#endif + + /* Step 2. Participate in the Allgatherv to collect the winners + into every processor in the column. */ + + my_row_buffer = &(B[prowindx[my_prow]*ldb]); + + int displ = 0; + for (j=0; j<nprow; j++) + { + displs[j] = displ; + recvcounts[j] = prow_cnts[j]*ldb; + displ += recvcounts[j]; + } + +#ifdef HPL_DETAILED_TIMING + MPI_Barrier ( PANEL->grid->col_comm ); + HPL_ptimer( HPL_TIMING_ALLGATHER ); +#endif + MPI_Allgatherv( + /* IN (void*) starting address of send buffer */ my_row_buffer, + /* IN (int) number of elements in send buffer */ recvcounts[my_prow], + /* IN (MPI_Datatype) data type of send buffer elems */ MPI_DOUBLE, + /* OUT (void*) address of receive buffer */ B, + /* IN (int*) elems to receive from process[j] */ recvcounts, + /* IN (int*) loc in recv buf to store elems from process[j] */ displs, + /* IN (MPI_DATATYPE) data type of recv buffer elems */ MPI_DOUBLE, + /* IN (MPI_Comm) communicator */ PANEL->grid->col_comm ); +#ifdef HPL_DETAILED_TIMING + HPL_ptimer( HPL_TIMING_ALLGATHER ); +#endif + + /* Step 3. Copy the winners from the AllGather buffer (B) into their + correct positions in U. The data in row j of B must be copied to + row BtoW[j] of U. */ + +#ifdef HPL_CALL_ACCEL + + (void) HPL_accel_panputU(PANEL, B, ldb, BtoW, NN); + +#else + /* HPL_dlaswp03T requires a very odd format for the array of target offsets. + Firstly, it is an array of doubles rather than an array of ints. And + secondly it has the same leading dimension as the source array. So we + really have no choice but to use an extra column in the source array to + store these offsets. */ + + for (j=0; j<jb; j++) { B[NN + j*ldb] = (double)(BtoW[j]); } + + HPL_dlaswp03T( + /* IN (int) Number of rows of U to copy */ jb, + /* IN (int) Number of cols of U to copy */ NN, + /* OUT (double *) Target of data copy */ PANEL->U, + /* IN (int) Leading dimension of tgt array */ PANEL->ldu, + /* IN (double *) Array of target offsets */ &B[NN], + /* IN (double *) Source of data to copy */ B, + /* IN (int) Leading dimension of src array */ ldb ); + +#endif + + /* Now the Allgatherv is done ... so we move on to the Scatterv. */ + + /* If we are in the top row of the trailing A */ + if ( my_prow == PANEL->prow ) { + + /* Step 1. Copy the loser rows from the matrix storage into the + appropriate position in B for the Scatterv. */ + + /* The data in the local buffer will be in row-major, big + endian format. */ + +#ifdef HPL_CALL_ACCEL + (void) HPL_accel_rowget (PANEL, B, ldb, + jb, BtoL, PANEL->jj, NN ); +#else + /* This is a little tricky, since we must skip copying any + rows that are not actually losers. These are indicated + by BtoL[j] == -1. 
*/ + + int num_losers = *ipl/2 - jb; + j = 0; + while (num_losers>0) + { + int src[8], dest[8]; + int row_cnt = 0; + + while ( (num_losers>0) && (row_cnt < 8) ) + { + if (BtoL[j] != -1) { + src[row_cnt] = BtoL[j] - PANEL->ii; + dest[row_cnt] = j; + num_losers--; + row_cnt++; + } + j++; + } + + HPL_dlaswp01T( + /* Number of rows of A to copy */ row_cnt, + /* Number of cols of A to copy */ NN, + /* Source of data to copy */ PANEL->A, + /* leading dimension of A */ PANEL->lda, + /* Target of data copy */ B, + /* Leading dimension of U (B) (row major) */ ldb, + /* Local row indexes of A to be copied */ src, + /* Local row indexes of U (B) to receive the data */ dest ); + } +#endif + } + + /* Step 2. Scatter the loser rows out to their new home processors */ + + my_row_buffer = &(B[prowindx[my_prow]*ldb]); + + /* The displs and recvcounts[j] used for the Allgatherv are exactly the + same for the Scatterv, so just reuse them. */ + +#ifdef HPL_DETAILED_TIMING + MPI_Barrier ( PANEL->grid->col_comm ); + HPL_ptimer( HPL_TIMING_SCATTER ); +#endif + MPI_Scatterv( + /* IN (void*) address of send buffer */ B, + /* IN (int*) elems to send to process [j] */ recvcounts, + /* IN (int*) loc in sendbuf holding elems for process[j] */ displs, + /* IN (MPI_DATATYPE) data type of send buffer elems */ MPI_DOUBLE, + /* OUT (void*) address of recv buffer */ my_row_buffer, + /* IN (int) number of elements in recv buffer */ recvcounts[my_prow], + /* IN (MPI_DATATYPE) data type of recv buffer elems */ MPI_DOUBLE, + /* IN (int) rank of sending process */ PANEL->prow, + /* IN (MPI_Comm) communicator */ PANEL->grid->col_comm ); +#ifdef HPL_DETAILED_TIMING + HPL_ptimer( HPL_TIMING_SCATTER ); +#endif + + /* Step 3. Copy the losers from the Scatter buffer into the matrix. */ + +#ifdef HPL_CALL_ACCEL + (void) HPL_accel_rowput (PANEL, my_row_buffer, ldb, + *my_loser_cnt, my_losers, PANEL->jj, NN); +#else + for (j=0; j<*my_loser_cnt; j+=8) + { + int src[8], dst[8]; + int k; + int row_cnt = *my_loser_cnt - j; + if (row_cnt>8) { row_cnt = 8; } + for (k=0; k<row_cnt; k++) + { + src[k] = k; + dst[k] = my_losers[j+k] - PANEL->ii; + } + HPL_dlaswp05T( + /* IN (int) Number of cols (rows) of U (B) to copy */ row_cnt, + /* IN (int) Number of rows (cols) of U (B) to copy */ NN, + /* OUT (double*) Target of data copy */ PANEL->A, + /* IN (int) leading dimension of A */ PANEL->lda, + /* IN (double*) Source of data to copy */ my_row_buffer, + /* IN (int) Leading dimension of U (B) (row major) */ ldb, + /* IN (int*) Local row indexes of A to receive the data */ dst, + /* IN (int*) Local col (row) indexes of U (B) to be copied */ src); + my_row_buffer += row_cnt*ldb; + } +#endif + +#ifdef HPL_USE_HUGE_PAGES + if ( vptr ) HPL_hpfree( vptr ); +#else + if ( vptr ) free( vptr ); +#endif + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer( HPL_TIMING_LASWP ); +#endif + + return; +} Index: src/pgesv/HPL_pdupdateNN.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdupdateNN.c,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- src/pgesv/HPL_pdupdateNN.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_pdupdateNN.c 26 Aug 2008 13:24:26 -0000 1.3 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
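The non-accelerated copy loops above move rows in batches of at most eight: a small src/dest index pair is filled and handed to HPL_dlaswp01T or HPL_dlaswp05T, and the buffer pointer advances by row_cnt*ldb. The batch size simply matches the fixed-size index arrays. The batching skeleton, extracted into a standalone form (hypothetical copy_rows stand-in for the HPL swap kernels):

    #include <stdio.h>

    static void copy_rows(const int *src, const int *dest, int n)
    {
        int k;
        for (k = 0; k < n; k++)
            printf("copy row %d -> slot %d\n", src[k], dest[k]);
    }

    /* Feed an index list to copy_rows in groups of <= 8, the way the
       Allgatherv/Scatterv copy loops feed HPL_dlaswp01T/05T. */
    static void batched_copy(const int *rows, int nrows)
    {
        int j, k;
        for (j = 0; j < nrows; j += 8) {
            int src[8], dest[8];
            int row_cnt = nrows - j;
            if (row_cnt > 8) row_cnt = 8;
            for (k = 0; k < row_cnt; k++) {
                src[k]  = rows[j + k];   /* where the row lives        */
                dest[k] = k;             /* its slot in the batch copy */
            }
            copy_rows(src, dest, row_cnt);
        }
    }

    int main(void)
    {
        int rows[11] = {3, 7, 1, 9, 0, 4, 8, 2, 6, 5, 10};
        batched_copy(rows, 11);
        return 0;
    }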
* --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -105,11 +108,10 @@ #ifdef HPL_CALL_VSIPL vsip_mview_d * Av0, * Av1, * Lv0, * Lv1, * Uv0, * Uv1; #endif - int curr, i, iroff, jb, lda, ldl2, mp, n, nb, - nq0, nn, test; + int curr, i, iroff, jb, lda, ldl2, ldu, mp, n, + nb, nq0, nn, test; static int tswap = 0; static HPL_T_SWAP fswap = HPL_NO_SWP; -#define LDU jb /* .. * .. Executable Statements .. */ @@ -274,7 +276,7 @@ */ nq0 = 0; curr = ( PANEL->grid->myrow == PANEL->prow ? 1 : 0 ); Aptr = PANEL->A; L2ptr = PANEL->L2; L1ptr = PANEL->L1; - Uptr = PANEL->U; ldl2 = PANEL->ldl2; + Uptr = PANEL->U; ldl2 = PANEL->ldl2; ldu = PANEL->ldu; mp = PANEL->mp - ( curr != 0 ? jb : 0 ); #ifdef HPL_CALL_VSIPL /* @@ -288,7 +290,7 @@ */ Av0 = vsip_mbind_d( PANEL->Ablock, 0, 1, lda, lda, PANEL->pmat->nq ); Lv0 = vsip_mbind_d( PANEL->L2block, 0, 1, ldl2, ldl2, jb ); - Uv0 = vsip_mbind_d( PANEL->Ublock, 0, 1, LDU, LDU, n ); + Uv0 = vsip_mbind_d( PANEL->Ublock, 0, 1, ldu, ldu, n ); /* * Create the matrix subviews */ @@ -302,7 +304,7 @@ nn = n - nq0; nn = Mmin( nb, nn ); HPL_dtrsm( HplColumnMajor, HplLeft, HplLower, HplNoTrans, - HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, LDU ); + HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, ldu ); if( curr != 0 ) { #ifdef HPL_CALL_VSIPL @@ -321,10 +323,10 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Mptr( Aptr, jb, 0, lda ), lda ); #endif - HPL_dlacpy( jb, nn, Uptr, LDU, Aptr, lda ); + HPL_dlacpy( jb, nn, Uptr, ldu, Aptr, lda ); } else { @@ -344,11 +346,11 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Aptr, lda ); #endif } - Uptr = Mptr( Uptr, 0, nn, LDU ); + Uptr = Mptr( Uptr, 0, nn, ldu ); Aptr = Mptr( Aptr, 0, nn, lda ); nq0 += nn; (void) HPL_bcast( PBCST, &test ); @@ -359,7 +361,7 @@ if( ( nn = n - nq0 ) > 0 ) { HPL_dtrsm( HplColumnMajor, HplLeft, HplLower, HplNoTrans, - HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, LDU ); + HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, ldu ); if( curr != 0 ) { @@ -379,10 +381,10 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Mptr( Aptr, jb, 0, lda ), lda ); #endif - HPL_dlacpy( jb, nn, Uptr, LDU, Aptr, lda ); + HPL_dlacpy( jb, nn, Uptr, ldu, Aptr, lda ); } else { @@ -402,7 +404,7 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Aptr, lda ); #endif } Index: src/pgesv/HPL_pdupdateNT.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdupdateNT.c,v retrieving revision 1.1 retrieving revision 1.16 diff -u -r1.1 -r1.16 --- src/pgesv/HPL_pdupdateNT.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_pdupdateNT.c 26 Aug 2008 13:24:26 -0000 1.16 @@ -44,10 +44,16 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ #include "hpl.h" +extern int panel_prep; + +#define OVERLAP_DGEMM_AND_BCAST 1 #ifdef STDC_HEADERS void HPL_pdupdateNT @@ -105,11 +111,10 @@ #ifdef HPL_CALL_VSIPL vsip_mview_d * Av0, * Av1, * Lv0, * Lv1, * Uv0, * Uv1; #endif - int curr, i, iroff, jb, lda, ldl2, mp, n, nb, - nq0, nn, test; + int curr, i, iroff, jb, lda, ldl2, ldu, mp, n, + nb, nq0, nn, test; static int tswap = 0; static HPL_T_SWAP fswap = HPL_NO_SWP; -#define LDU n /* .. * .. Executable Statements .. */ @@ -133,18 +138,22 @@ #endif return; } +#ifdef OVERLAP_DGEMM_AND_BCAST + test = HPL_KEEP_TESTING; +#else /* * Enable/disable the column panel probing mechanism */ (void) HPL_bcast( PBCST, &test ); +#endif /* * 1 x Q case */ if( PANEL->grid->nprow == 1 ) { - Aptr = PANEL->A; L2ptr = PANEL->L2; L1ptr = PANEL->L1; - ldl2 = PANEL->ldl2; dpiv = PANEL->DPIV; ipiv = PANEL->IWORK; - mp = PANEL->mp - jb; iroff = PANEL->ii; nq0 = 0; + Aptr = PANEL->A; L2ptr = PANEL->L2; L1ptr = PANEL->L1; + ldl2 = PANEL->ldl2; dpiv = PANEL->DPIV; ipiv = PANEL->IWORK; + mp = PANEL->mp - jb; iroff = PANEL->ii; nq0 = 0; #ifdef HPL_CALL_VSIPL /* * Admit the blocks @@ -162,6 +171,8 @@ Lv1 = vsip_msubview_d( Lv0, 0, 0, mp, jb ); #endif for( i = 0; i < jb; i++ ) { ipiv[i] = (int)(dpiv[i]) - iroff; } + +#ifndef OVERLAP_DGEMM_AND_BCAST /* * So far we have not updated anything - test availability of the panel * to be forwarded - If detected forward it and finish the update in one @@ -175,11 +186,21 @@ */ #ifdef HPL_DETAILED_TIMING HPL_ptimer( HPL_TIMING_LASWP ); - HPL_dlaswp00N( jb, nn, Aptr, lda, ipiv ); - HPL_ptimer( HPL_TIMING_LASWP ); +#endif +#ifdef HPL_CALL_ACCEL + HPL_accel_swap00N( PANEL, ipiv, nq0, nn ); #else HPL_dlaswp00N( jb, nn, Aptr, lda, ipiv ); #endif +#ifdef HPL_DETAILED_TIMING + HPL_ptimer( HPL_TIMING_LASWP ); +#endif + +#ifdef HPL_CALL_ACCEL + HPL_accel_dtrsm(PANEL, nq0, nn); + + HPL_accel_dgemm(PANEL, nq0, nn); +#else HPL_dtrsm( HplColumnMajor, HplLeft, HplLower, HplNoTrans, HplUnit, jb, nn, HPL_rone, L1ptr, jb, Aptr, lda ); #ifdef HPL_CALL_VSIPL @@ -197,14 +218,17 @@ (void) vsip_mdestroy_d( Av1 ); (void) vsip_mdestroy_d( Uv1 ); #else + HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn, jb, -HPL_rone, L2ptr, ldl2, Aptr, lda, HPL_rone, Mptr( Aptr, jb, 0, lda ), lda ); #endif +#endif Aptr = Mptr( Aptr, 0, nn, lda ); nq0 += nn; (void) HPL_bcast( PBCST, &test ); } +#endif /* #ifndef OVERLAP_DGEMM_AND_BCAST */ /* * The panel has been forwarded at that point, finish the update */ @@ -212,11 +236,31 @@ { #ifdef HPL_DETAILED_TIMING HPL_ptimer( HPL_TIMING_LASWP ); +#endif +#ifdef HPL_CALL_ACCEL + HPL_accel_swap00N( PANEL, ipiv, nq0, nn ); +#else HPL_dlaswp00N( jb, nn, Aptr, lda, ipiv ); +#endif +#ifdef HPL_DETAILED_TIMING HPL_ptimer( HPL_TIMING_LASWP ); +#endif +#ifdef HPL_CALL_ACCEL + HPL_accel_dtrsm(PANEL, nq0, nn); + +#ifdef OVERLAP_DGEMM_AND_BCAST + HPL_accel_dgemm_async(PANEL, nq0, nn); + + if ( PBCST != NULL ) { + while( test != HPL_SUCCESS ) + { (void) HPL_bcast( PBCST, &test ); } + } + + HPL_accel_dgemm_wait(PANEL); #else - HPL_dlaswp00N( jb, nn, Aptr, lda, ipiv ); + HPL_accel_dgemm(PANEL, nq0, nn); #endif +#else HPL_dtrsm( HplColumnMajor, HplLeft, HplLower, HplNoTrans, HplUnit, jb, nn, HPL_rone, L1ptr, jb, Aptr, lda ); #ifdef 
HPL_CALL_VSIPL @@ -238,6 +282,7 @@ jb, -HPL_rone, L2ptr, ldl2, Aptr, lda, HPL_rone, Mptr( Aptr, jb, 0, lda ), lda ); #endif +#endif } #ifdef HPL_CALL_VSIPL /* @@ -267,6 +312,8 @@ if( ( fswap == HPL_SWAP01 ) || ( ( fswap == HPL_SW_MIX ) && ( n > tswap ) ) ) { HPL_pdlaswp01T( PBCST, &test, PANEL, n ); } + else if ( fswap == HPL_SWAP03 ) + { HPL_pdlaswp03T( PBCST, &test, PANEL, n ); } else { HPL_pdlaswp00T( PBCST, &test, PANEL, n ); } /* @@ -274,7 +321,7 @@ */ nq0 = 0; curr = ( PANEL->grid->myrow == PANEL->prow ? 1 : 0 ); Aptr = PANEL->A; L2ptr = PANEL->L2; L1ptr = PANEL->L1; - Uptr = PANEL->U; ldl2 = PANEL->ldl2; + Uptr = PANEL->U; ldl2 = PANEL->ldl2; ldu = PANEL->ldu; mp = PANEL->mp - ( curr != 0 ? jb : 0 ); #ifdef HPL_CALL_VSIPL /* @@ -288,12 +335,14 @@ */ Av0 = vsip_mbind_d( PANEL->Ablock, 0, 1, lda, lda, PANEL->pmat->nq ); Lv0 = vsip_mbind_d( PANEL->L2block, 0, 1, ldl2, ldl2, jb ); - Uv0 = vsip_mbind_d( PANEL->Ublock, 0, 1, LDU, LDU, jb ); + Uv0 = vsip_mbind_d( PANEL->Ublock, 0, 1, ldu, ldu, jb ); /* * Create the matrix subviews */ Lv1 = vsip_msubview_d( Lv0, 0, 0, mp, jb ); #endif + +#ifndef OVERLAP_DGEMM_AND_BCAST /* * Broadcast has not occured yet, spliting the computational part */ @@ -302,7 +351,7 @@ nn = n - nq0; nn = Mmin( nb, nn ); HPL_dtrsm( HplColumnMajor, HplRight, HplLower, HplTrans, - HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, LDU ); + HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, ldu ); if( curr != 0 ) { @@ -322,10 +371,10 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Mptr( Aptr, jb, 0, lda ), lda ); #endif - HPL_dlatcpy( jb, nn, Uptr, LDU, Aptr, lda ); + HPL_dlatcpy( jb, nn, Uptr, ldu, Aptr, lda ); } else { @@ -345,22 +394,39 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Aptr, lda ); #endif } - Uptr = Mptr( Uptr, nn, 0, LDU ); + Uptr = Mptr( Uptr, nn, 0, ldu ); Aptr = Mptr( Aptr, 0, nn, lda ); nq0 += nn; (void) HPL_bcast( PBCST, &test ); } +#endif /* #ifndef OVERLAP_DGEMM_AND_BCAST */ /* * The panel has been forwarded at that point, finish the update */ if( ( nn = n - nq0 ) > 0 ) { +#ifdef HPL_CALL_ACCEL + HPL_accel_dtrsm(PANEL, nq0, nn); + +#ifdef OVERLAP_DGEMM_AND_BCAST + HPL_accel_dgemm_async(PANEL, nq0, nn); + + if ( PBCST != NULL ) { + while( test != HPL_SUCCESS ) + { (void) HPL_bcast( PBCST, &test ); } + } + + HPL_accel_dgemm_wait(PANEL); +#else + HPL_accel_dgemm(PANEL, nq0, nn); +#endif +#else HPL_dtrsm( HplColumnMajor, HplRight, HplLower, HplTrans, - HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, LDU ); + HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, ldu ); if( curr != 0 ) { @@ -380,10 +446,10 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Mptr( Aptr, jb, 0, lda ), lda ); #endif - HPL_dlatcpy( jb, nn, Uptr, LDU, Aptr, lda ); + HPL_dlatcpy( jb, nn, Uptr, ldu, Aptr, lda ); } else { @@ -403,10 +469,11 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Aptr, lda ); #endif } +#endif /* !OVERLAP_DGEMM_AND_BCAST */ } #ifdef HPL_CALL_VSIPL /* @@ -428,7 +495,15 @@ #endif } +#ifdef 
OVERLAP_DGEMM_AND_BCAST + if ( PBCST != NULL ) { + while( test != HPL_SUCCESS ) + { (void) HPL_bcast( PBCST, &test ); } + } +#endif + PANEL->A = Mptr( PANEL->A, 0, n, lda ); PANEL->nq -= n; PANEL->jj += n; + PANEL->ja += n; /* * return the outcome of the probe (should always be HPL_SUCCESS, the * panel broadcast is enforced in that routine). Index: src/pgesv/HPL_pdupdateTN.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdupdateTN.c,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- src/pgesv/HPL_pdupdateTN.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_pdupdateTN.c 26 Aug 2008 13:24:26 -0000 1.3 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -105,11 +108,10 @@ #ifdef HPL_CALL_VSIPL vsip_mview_d * Av0, * Av1, * Lv0, * Lv1, * Uv0, * Uv1; #endif - int curr, i, iroff, jb, lda, ldl2, mp, n, nb, - nq0, nn, test; + int curr, i, iroff, jb, lda, ldl2, ldu, mp, n, + nb, nq0, nn, test; static int tswap = 0; static HPL_T_SWAP fswap = HPL_NO_SWP; -#define LDU jb /* .. * .. Executable Statements .. */ @@ -274,7 +276,7 @@ */ nq0 = 0; curr = ( PANEL->grid->myrow == PANEL->prow ? 1 : 0 ); Aptr = PANEL->A; L2ptr = PANEL->L2; L1ptr = PANEL->L1; - Uptr = PANEL->U; ldl2 = PANEL->ldl2; + Uptr = PANEL->U; ldl2 = PANEL->ldl2; ldu = PANEL->ldu; mp = PANEL->mp - ( curr != 0 ? jb : 0 ); #ifdef HPL_CALL_VSIPL /* @@ -288,7 +290,7 @@ */ Av0 = vsip_mbind_d( PANEL->Ablock, 0, 1, lda, lda, PANEL->pmat->nq ); Lv0 = vsip_mbind_d( PANEL->L2block, 0, 1, ldl2, ldl2, jb ); - Uv0 = vsip_mbind_d( PANEL->Ublock, 0, 1, LDU, LDU, n ); + Uv0 = vsip_mbind_d( PANEL->Ublock, 0, 1, ldu, ldu, n ); /* * Create the matrix subviews */ @@ -302,7 +304,7 @@ nn = n - nq0; nn = Mmin( nb, nn ); HPL_dtrsm( HplColumnMajor, HplLeft, HplUpper, HplTrans, - HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, LDU ); + HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, ldu ); if( curr != 0 ) { @@ -322,10 +324,10 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Mptr( Aptr, jb, 0, lda ), lda ); #endif - HPL_dlacpy( jb, nn, Uptr, LDU, Aptr, lda ); + HPL_dlacpy( jb, nn, Uptr, ldu, Aptr, lda ); } else { @@ -345,11 +347,11 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Aptr, lda ); #endif } - Uptr = Mptr( Uptr, 0, nn, LDU ); + Uptr = Mptr( Uptr, 0, nn, ldu ); Aptr = Mptr( Aptr, 0, nn, lda ); nq0 += nn; (void) HPL_bcast( PBCST, &test ); @@ -360,7 +362,7 @@ if( ( nn = n - nq0 ) > 0 ) { HPL_dtrsm( HplColumnMajor, HplLeft, HplUpper, HplTrans, - HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, LDU ); + HplUnit, jb, nn, HPL_rone, L1ptr, jb, Uptr, ldu ); if( curr != 0 ) { @@ -380,10 +382,10 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Mptr( Aptr, jb, 0, lda ), lda ); #endif - HPL_dlacpy( jb, nn, Uptr, 
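OVERLAP_DGEMM_AND_BCAST restructures the update so the accelerator DGEMM runs while the CPU drives the panel broadcast: HPL_accel_dgemm_async starts the multiply, the HPL_bcast loop polls the broadcast to completion, and HPL_accel_dgemm_wait blocks until the multiply is done. HPL's poll-driven HPL_bcast predates nonblocking MPI collectives; with MPI-3 the same overlap can be phrased with MPI_Ibcast, as in this illustrative (non-HPL) sketch where a dummy loop stands in for the asynchronous DGEMM:

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int rank, i;
        double panel[1024] = {0}, acc = 0.0;
        MPI_Request req;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* Start the broadcast, then compute while it progresses. */
        MPI_Ibcast(panel, 1024, MPI_DOUBLE, 0, MPI_COMM_WORLD, &req);
        for (i = 0; i < 1000000; i++) acc += (double)i * 1e-9;  /* "DGEMM" */
        MPI_Wait(&req, MPI_STATUS_IGNORE);  /* panel ready before next use */

        if (rank == 0) printf("acc = %f\n", acc);
        MPI_Finalize();
        return 0;
    }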
LDU, Aptr, lda ); + HPL_dlacpy( jb, nn, Uptr, ldu, Aptr, lda ); } else { @@ -403,7 +405,7 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Aptr, lda ); #endif } Index: src/pgesv/HPL_pdupdateTT.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_pdupdateTT.c,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- src/pgesv/HPL_pdupdateTT.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_pdupdateTT.c 26 Aug 2008 13:24:26 -0000 1.3 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -105,11 +108,10 @@ #ifdef HPL_CALL_VSIPL vsip_mview_d * Av0, * Av1, * Lv0, * Lv1, * Uv0, * Uv1; #endif - int curr, i, iroff, jb, lda, ldl2, mp, n, nb, - nq0, nn, test; + int curr, i, iroff, jb, lda, ldl2, ldu, mp, n, + nb, nq0, nn, test; static int tswap = 0; static HPL_T_SWAP fswap = HPL_NO_SWP; -#define LDU n /* .. * .. Executable Statements .. */ @@ -274,7 +276,7 @@ */ nq0 = 0; curr = ( PANEL->grid->myrow == PANEL->prow ? 1 : 0 ); Aptr = PANEL->A; L2ptr = PANEL->L2; L1ptr = PANEL->L1; - Uptr = PANEL->U; ldl2 = PANEL->ldl2; + Uptr = PANEL->U; ldl2 = PANEL->ldl2; ldu = PANEL->ldu; mp = PANEL->mp - ( curr != 0 ? jb : 0 ); #ifdef HPL_CALL_VSIPL /* @@ -288,7 +290,7 @@ */ Av0 = vsip_mbind_d( PANEL->Ablock, 0, 1, lda, lda, PANEL->pmat->nq ); Lv0 = vsip_mbind_d( PANEL->L2block, 0, 1, ldl2, ldl2, jb ); - Uv0 = vsip_mbind_d( PANEL->Ublock, 0, 1, LDU, LDU, jb ); + Uv0 = vsip_mbind_d( PANEL->Ublock, 0, 1, ldu, ldu, jb ); /* * Create the matrix subviews */ @@ -302,7 +304,7 @@ nn = n - nq0; nn = Mmin( nb, nn ); HPL_dtrsm( HplColumnMajor, HplRight, HplUpper, HplNoTrans, - HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, LDU ); + HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, ldu ); if( curr != 0 ) { @@ -322,10 +324,10 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Mptr( Aptr, jb, 0, lda ), lda ); #endif - HPL_dlatcpy( jb, nn, Uptr, LDU, Aptr, lda ); + HPL_dlatcpy( jb, nn, Uptr, ldu, Aptr, lda ); } else { @@ -345,11 +347,11 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Aptr, lda ); #endif } - Uptr = Mptr( Uptr, nn, 0, LDU ); + Uptr = Mptr( Uptr, nn, 0, ldu ); Aptr = Mptr( Aptr, 0, nn, lda ); nq0 += nn; (void) HPL_bcast( PBCST, &test ); @@ -360,7 +362,7 @@ if( ( nn = n - nq0 ) > 0 ) { HPL_dtrsm( HplColumnMajor, HplRight, HplUpper, HplNoTrans, - HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, LDU ); + HplUnit, nn, jb, HPL_rone, L1ptr, jb, Uptr, ldu ); if( curr != 0 ) { @@ -380,10 +382,10 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Mptr( Aptr, jb, 0, lda ), lda ); #endif - HPL_dlatcpy( jb, nn, Uptr, LDU, Aptr, lda ); + 
HPL_dlatcpy( jb, nn, Uptr, ldu, Aptr, lda ); } else { @@ -403,7 +405,7 @@ (void) vsip_mdestroy_d( Uv1 ); #else HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, mp, nn, - jb, -HPL_rone, L2ptr, ldl2, Uptr, LDU, HPL_rone, + jb, -HPL_rone, L2ptr, ldl2, Uptr, ldu, HPL_rone, Aptr, lda ); #endif } Index: src/pgesv/HPL_rollT.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_rollT.c,v retrieving revision 1.1 retrieving revision 1.4 diff -u -r1.1 -r1.4 --- src/pgesv/HPL_rollT.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_rollT.c 26 Aug 2008 13:24:26 -0000 1.4 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -137,7 +140,7 @@ /* * .. Local Variables .. */ -#if 0 +#ifdef HPL_USE_MPI_DATATYPE MPI_Datatype type[2]; #endif MPI_Status status; @@ -182,7 +185,7 @@ if( lengthR > 0 ) { -#if 0 +#ifdef HPL_USE_MPI_DATATYPE if( ierr == MPI_SUCCESS ) { if( LDU == N ) @@ -209,7 +212,7 @@ if( lengthS > 0 ) { -#if 0 +#ifdef HPL_USE_MPI_DATATYPE if( ierr == MPI_SUCCESS ) { if( LDU == N ) @@ -240,7 +243,7 @@ { if( ierr == MPI_SUCCESS ) ierr = MPI_Wait( &request, &status ); -#if 0 +#ifdef HPL_USE_MPI_DATATYPE if( ierr == MPI_SUCCESS ) ierr = MPI_Type_free( &type[I_RECV] ); #endif Index: src/pgesv/HPL_spreadT.c =================================================================== RCS file: /cvsroot/hpl_qs22/src/pgesv/HPL_spreadT.c,v retrieving revision 1.1 retrieving revision 1.4 diff -u -r1.1 -r1.4 --- src/pgesv/HPL_spreadT.c 10 Feb 2008 21:45:51 -0000 1.1 +++ src/pgesv/HPL_spreadT.c 26 Aug 2008 13:24:26 -0000 1.4 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -153,7 +156,7 @@ /* * .. Local Variables .. 
*/ -#if 0 +#ifdef HPL_USE_MPI_DATATYPE MPI_Datatype type; #endif MPI_Status status; @@ -194,7 +197,7 @@ if( mydist & ip2 ) { -#if 0 +#ifdef HPL_USE_MPI_DATATYPE if( ierr == MPI_SUCCESS ) { if( LDU == N ) @@ -224,7 +227,7 @@ } else if( partner < nprow ) { -#if 0 +#ifdef HPL_USE_MPI_DATATYPE if( ierr == MPI_SUCCESS ) { if( LDU == N ) @@ -293,7 +296,7 @@ if( mydist & ip2 ) { -#if 0 +#ifdef HPL_USE_MPI_DATATYPE if( ierr == MPI_SUCCESS ) { if( LDU == N ) @@ -323,7 +326,7 @@ } else if( partner < nprow ) { -#if 0 +#ifdef HPL_USE_MPI_DATATYPE if( ierr == MPI_SUCCESS ) { if( LDU == N ) Index: testing/ptest/HPL.dat =================================================================== RCS file: /cvsroot/hpl_qs22/testing/ptest/HPL.dat,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- testing/ptest/HPL.dat 10 Feb 2008 21:45:52 -0000 1.1 +++ testing/ptest/HPL.dat 27 Apr 2008 23:55:48 -0000 1.2 @@ -23,7 +23,7 @@ 0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM) 1 # of lookahead depth 0 DEPTHs (>=0) -2 SWAP (0=bin-exch,1=long,2=mix) +2 SWAP (0=bin-exch,1=long,2=mix,3=MPI-coll) 64 swapping threshold 0 L1 in (0=transposed,1=no-transposed) form 0 U in (0=transposed,1=no-transposed) form Index: testing/ptest/HPL_pddriver.c =================================================================== RCS file: /cvsroot/hpl_qs22/testing/ptest/HPL_pddriver.c,v retrieving revision 1.1 retrieving revision 1.3 diff -u -r1.1 -r1.3 --- testing/ptest/HPL_pddriver.c 10 Feb 2008 21:45:52 -0000 1.1 +++ testing/ptest/HPL_pddriver.c 26 Aug 2008 13:24:26 -0000 1.3 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -112,6 +115,9 @@ #endif MPI_Comm_rank( MPI_COMM_WORLD, &rank ); MPI_Comm_size( MPI_COMM_WORLD, &size ); +#ifdef HPL_CALL_ACCEL + HPL_accel_init(rank); +#endif /* * Read and check validity of test parameters from input file * @@ -280,6 +286,9 @@ if( ( test.outfp != stdout ) && ( test.outfp != stderr ) ) (void) fclose( test.outfp ); } +#ifdef HPL_CALL_ACCEL + HPL_accel_exit(rank); +#endif #ifdef HPL_CALL_VSIPL vsip_finalize((void*)0); #endif Index: testing/ptest/HPL_pdinfo.c =================================================================== RCS file: /cvsroot/hpl_qs22/testing/ptest/HPL_pdinfo.c,v retrieving revision 1.1 retrieving revision 1.7 diff -u -r1.1 -r1.7 --- testing/ptest/HPL_pdinfo.c 10 Feb 2008 21:45:52 -0000 1.1 +++ testing/ptest/HPL_pdinfo.c 26 Aug 2008 13:24:26 -0000 1.7 @@ -44,6 +44,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
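The HPL_rollT/HPL_spreadT hunks above turn previously dead #if 0 blocks into an HPL_USE_MPI_DATATYPE compile-time option: when enabled, a U panel whose leading dimension differs from its row length is described to MPI with a derived datatype instead of being sent as a contiguous buffer. The core of such a description is a vector type, sketched here with illustrative dimensions:

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int jb = 4, n = 100, ldu = 128;   /* padded leading dimension */
        int sz;
        MPI_Datatype strided_panel;

        MPI_Init(&argc, &argv);

        /* jb rows of n doubles each, rows ldu doubles apart. */
        MPI_Type_vector(jb, n, ldu, MPI_DOUBLE, &strided_panel);
        MPI_Type_commit(&strided_panel);

        MPI_Type_size(strided_panel, &sz);
        printf("payload bytes: %d\n", sz);   /* jb*n*sizeof(double) */

        MPI_Type_free(&strided_panel);
        MPI_Finalize();
        return 0;
    }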
* --------------------------------------------------------------------- */ +/* ------------------------------------------------------------------ */ +/* Modifications (C) Copyright IBM Corporation 2008 */ +/* ------------------------------------------------------------------ */ /* * Include files */ @@ -368,6 +371,15 @@ "Value of NB less than 1" ); error = 1; goto label_error; } +#ifdef HPL_CALL_ACCEL + /* Accelerator is hard-coded for NB=128 */ + if( NB[ i ] != 128 ) + { + HPL_pwarn( stderr, __LINE__, "HPL_pdinfo", + "Value of NB must be 128 for hybrid architecture" ); + error = 1; goto label_error; + } +#endif } /* * Process grids, mapping, (>=1) (P, Q) @@ -565,13 +577,14 @@ } } /* - * Swapping algorithm (0,1 or 2) (FSWAP) + * Swapping algorithm (0,1,2 or 3) (FSWAP) */ (void) fgets( line, HPL_LINE_MAX - 2, infp ); (void) sscanf( line, "%s", num ); j = atoi( num ); if( j == 0 ) *FSWAP = HPL_SWAP00; else if( j == 1 ) *FSWAP = HPL_SWAP01; else if( j == 2 ) *FSWAP = HPL_SW_MIX; + else if( j == 3 ) *FSWAP = HPL_SWAP03; else *FSWAP = HPL_SWAP01; /* * Swapping threshold (>=0) (TSWAP) @@ -585,12 +598,30 @@ (void) fgets( line, HPL_LINE_MAX - 2, infp ); (void) sscanf( line, "%s", num ); *L1NOTRAN = atoi( num ); if( ( *L1NOTRAN != 0 ) && ( *L1NOTRAN != 1 ) ) *L1NOTRAN = 0; +#ifdef HPL_CALL_ACCEL + /* Accelerator code paths currently only implemented for L1 no-transposed */ + if ( *L1NOTRAN != 1 ) + { + HPL_pwarn( stderr, __LINE__, "HPL_pdinfo", + "L transposed is not currently supported for hybrid architecture" ); + error = 1; goto label_error; + } +#endif /* * U in (no-)transposed form (0 or 1) */ (void) fgets( line, HPL_LINE_MAX - 2, infp ); (void) sscanf( line, "%s", num ); *UNOTRAN = atoi( num ); if( ( *UNOTRAN != 0 ) && ( *UNOTRAN != 1 ) ) *UNOTRAN = 0; +#ifdef HPL_CALL_ACCEL + /* Accelerator code paths currently only implemented for U transposed */ + if( *UNOTRAN != 0 ) + { + HPL_pwarn( stderr, __LINE__, "HPL_pdinfo", + "U no-transposed is not currently supported for hybrid architecture" ); + error = 1; goto label_error; + } +#endif /* * Equilibration (0=no, 1=yes) */ @@ -603,6 +634,11 @@ (void) fgets( line, HPL_LINE_MAX - 2, infp ); (void) sscanf( line, "%s", num ); *ALIGN = atoi( num ); if( *ALIGN <= 0 ) *ALIGN = 4; +#ifdef HPL_CALL_ACCEL + /* Accelerator is hard-coded for ALIGN=64 */ + *ALIGN = 64; +#endif + /* * Close input file */ @@ -703,6 +739,7 @@ if( *FSWAP == HPL_SWAP00 ) iwork[j] = 0; else if( *FSWAP == HPL_SWAP01 ) iwork[j] = 1; else if( *FSWAP == HPL_SW_MIX ) iwork[j] = 2; + else if( *FSWAP == HPL_SWAP03 ) iwork[j] = 3; j++; } (void) HPL_broadcast( (void*)iwork, lwork, HPL_INT, 0, @@ -746,6 +783,7 @@ if( iwork[j] == 0 ) *FSWAP = HPL_SWAP00; else if( iwork[j] == 1 ) *FSWAP = HPL_SWAP01; else if( iwork[j] == 2 ) *FSWAP = HPL_SW_MIX; + else if( iwork[j] == 3 ) *FSWAP = HPL_SWAP03; j++; } if( iwork ) free( iwork ); @@ -766,6 +804,20 @@ HPL_fprintf( TEST->outfp, "%s%s\n", "======================================", "======================================" ); +#ifdef HPL_CALL_ACCEL + HPL_fprintf( TEST->outfp, "%s%s\n", + "======================================", + "======================================" ); + HPL_fprintf( TEST->outfp, "%s%s\n", + "Modified for hybrid architectures -- ", + " April 30, 2008" ); + HPL_fprintf( TEST->outfp, "%s%s\n", + "by M. Kistler, J. Gunnels, D. Brokenshire, and B. 
Index: testing/ptest/HPL_pdtest.c
===================================================================
RCS file: /cvsroot/hpl_qs22/testing/ptest/HPL_pdtest.c,v
retrieving revision 1.1
retrieving revision 1.9
diff -u -r1.1 -r1.9
--- testing/ptest/HPL_pdtest.c  10 Feb 2008 21:45:52 -0000      1.1
+++ testing/ptest/HPL_pdtest.c  26 Aug 2008 13:24:26 -0000      1.9
@@ -44,6 +44,9 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ---------------------------------------------------------------------
 */
+/* ------------------------------------------------------------------ */
+/* Modifications (C) Copyright IBM Corporation 2008                    */
+/* ------------------------------------------------------------------ */
 /*
 * Include files
 */
@@ -160,7 +163,14 @@
 /*
 * Allocate dynamic memory
 */
-   vptr = (void*)malloc( (ALGO->align + (mat.ld+1)*(mat.nq))*sizeof(double) );
+   size_t mem_align = mat.nb*mat.nb;
+   if ( (mem_align % ALGO->align) != 0 ) mem_align *= ALGO->align;
+   size_t mem_size = (mem_align + (mat.ld+1)*(((mat.nq+mat.nb-1)/mat.nb)*mat.nb))*sizeof(double);
+#ifdef HPL_USE_HUGE_PAGES
+   vptr = (void*)HPL_hpalloc( mem_size );
+#else
+   vptr = (void*)malloc( mem_size );
+#endif
    info[0] = (vptr == NULL); info[1] = myrow; info[2] = mycol;
    (void) HPL_all_reduce( (void *)(info), 3, HPL_INT, HPL_max,
                           GRID->all_comm );
@@ -176,7 +186,8 @@
 /*
 * generate matrix and right-hand-side, [ A | b ] which is N by N+1.
 */
-   mat.A = (double *)HPL_PTR( vptr,
+   double *xptr = (double *)HPL_PTR( vptr, mem_align * sizeof(double) );
+   mat.A = (double *)HPL_PTR( xptr,
                               ((size_t)(ALGO->align) * sizeof(double) ) );
    mat.X = mat.A + (mat.ld * mat.nq);
    HPL_pdmatgen( GRID, N, N+1, NB, mat.A, mat.ld, HPL_ISEED );
@@ -288,6 +299,27 @@
             "+ Max aggregated wall time laswp . . : %18.2f\n",
             HPL_w[HPL_TIMING_LASWP-HPL_TIMING_BEG] );
 /*
+ * Swap allgather time
+ */
+      if( HPL_w[HPL_TIMING_ALLGATHER-HPL_TIMING_BEG] > HPL_rzero )
+         HPL_fprintf( TEST->outfp,
+            "+ + Max aggregated wall time allgather:%18.2f\n",
+            HPL_w[HPL_TIMING_ALLGATHER-HPL_TIMING_BEG] );
+/*
+ * Swap scatter time
+ */
+      if( HPL_w[HPL_TIMING_SCATTER-HPL_TIMING_BEG] > HPL_rzero )
+         HPL_fprintf( TEST->outfp,
+            "+ + Max aggregated wall time scatter : %18.2f\n",
+            HPL_w[HPL_TIMING_SCATTER-HPL_TIMING_BEG] );
+/*
+ * Accelerator overhead (setup & cleanup)
+ */
+      if( HPL_w[HPL_TIMING_ACCEL_OVERHEAD-HPL_TIMING_BEG] > HPL_rzero )
+         HPL_fprintf( TEST->outfp,
+            "Max aggregated wall time accel ovhd : %18.2f\n",
+            HPL_w[HPL_TIMING_ACCEL_OVERHEAD-HPL_TIMING_BEG] );
+/*
 * Upper triangular system solve
 */
       if( HPL_w[HPL_TIMING_PTRSV-HPL_TIMING_BEG] > HPL_rzero )
@@ -305,7 +337,11 @@
 * Quick return, if I am not interested in checking the computations
 */
    if( TEST->thrsh <= HPL_rzero )
+#ifdef HPL_USE_HUGE_PAGES
+   { (TEST->kpass)++; if( vptr ) HPL_hpfree( vptr ); return; }
+#else
    { (TEST->kpass)++; if( vptr ) free( vptr ); return; }
+#endif
 /*
 * Check info returned by solve
 */
@@ -315,7 +351,11 @@
       HPL_pwarn( TEST->outfp, __LINE__, "HPL_pdtest", "%s %d, %s",
                  "Error code returned by solve is", mat.info, "skip" );
       (TEST->kskip)++;
+#ifdef HPL_USE_HUGE_PAGES
+      if( vptr ) HPL_hpfree( vptr ); return;
+#else
      if( vptr ) free( vptr ); return;
+#endif
    }
 /*
 * Check computation, re-generate [ A | b ], compute norm 1 and inf of A and x,
@@ -404,7 +444,11 @@
            "||x||_1 . . . . . . . . . . . . . . . . . . . = ", Xnorm1 );
       }
    }
+#ifdef HPL_USE_HUGE_PAGES
+   if( vptr ) HPL_hpfree( vptr );
+#else
    if( vptr ) free( vptr );
+#endif
 /*
 * End of HPL_pdtest
 */
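
Note: HPL_hpalloc/HPL_hpfree, used above under HPL_USE_HUGE_PAGES, are not
defined in this diff. On Linux the usual mechanism is a hugetlbfs-backed
mapping; the stand-in below illustrates the idea with the newer MAP_HUGETLB
mmap flag (code of this era would more likely have mapped a file from a
mounted hugetlbfs, for example through libhugetlbfs).
hp_alloc_sketch/hp_free_sketch are illustrative names only.

   #include <stddef.h>
   #include <sys/mman.h>

   /* Reserve 'bytes' backed by huge pages; returns NULL on failure.
    * Requires huge pages reserved by the administrator
    * (e.g. sysctl vm.nr_hugepages). */
   static void *hp_alloc_sketch( size_t bytes )
   {
      void *p = mmap( NULL, bytes, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0 );
      return ( p == MAP_FAILED ) ? NULL : p;
   }

   /* munmap() needs the mapping length, so a real HPL_hpfree(void*)
    * would have to record each allocation's size internally. */
   static void hp_free_sketch( void *p, size_t bytes )
   {
      if( p != NULL ) (void) munmap( p, bytes );
   }

Huge pages pay off here because HPL_pdtest.c now rounds mat.nq up to a
whole number of NB-wide panels and pushes the matrix base to a mem_align
boundary of NB*NB doubles (with the NB=128 and ALIGN=64 settings enforced
in HPL_pdinfo.c), so each process touches large, uniformly aligned panels
through far fewer TLB entries.
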