From fe3c778c51de8ca541b3ad158e783d6826128312 Mon Sep 17 00:00:00 2001
From: kavanabhat <Kavana.bhat@in.ibm.com>
Date: Thu, 30 Sep 2021 06:06:27 -0500
Subject: [PATCH 1/2] AIX changes for POWER10 with the GNU compiler

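Enable the POWER10 kernels on AIX (big endian) when building with the
GNU toolchain, instead of falling back to the POWER8 code paths:

* KERNEL.POWER10: drop the big-endian include of KERNEL.POWER8.
* C micro-kernels: add __BYTE_ORDER__-guarded permute masks, store
  orderings and splats for big endian.
* Kernel dispatch files (cdot, cswap, dasum, drot, dscal, dswap, sasum,
  srot, sscal, sswap): select the POWER10 micro-kernels on both
  endiannesses.
* cgemm/zgemm assembly: define the permute constants with .set and
  materialize them with explicit shift-and-mask sequences, since the
  @highest/@higher/@h/@l operators are not available on AIX; extend the
  linux/FreeBSD frame-slot loads to _AIX.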
---
 kernel/power/KERNEL.POWER10           |   5 +-
 kernel/power/caxpy_microk_power10.c   |  29 ++-
 kernel/power/ccopy_microk_power10.c   |  45 +++-
 kernel/power/cdot.c                   |   4 +-
 kernel/power/cdot_microk_power10.c    |   8 +
 kernel/power/cgemm_kernel_power10.S   |  36 ++-
 kernel/power/cgemm_macros_power10.S   | 306 ++++++++++++++++++++++++++
 kernel/power/cscal_microk_power10.c   |   4 +
 kernel/power/cswap.c                  |   4 +-
 kernel/power/dasum.c                  |   7 +-
 kernel/power/dgemv_n_microk_power10.c |  86 +++++++-
 kernel/power/dgemv_t_power10.c        |  36 ++-
 kernel/power/drot.c                   |   6 +-
 kernel/power/dscal.c                  |   8 +-
 kernel/power/dswap.c                  |   6 +-
 kernel/power/sasum.c                  |   6 +-
 kernel/power/srot.c                   |   6 +-
 kernel/power/sscal.c                  |   8 +-
 kernel/power/sswap.c                  |   6 +-
 kernel/power/zaxpy_microk_power10.c   |   8 +
 kernel/power/zgemm_kernel_power10.S   |   4 +-
 kernel/power/zgemm_macros_power10.S   | 301 +++++++++++++++++++++----
 kernel/power/zgemv_t_4.c              |   2 +-
 kernel/power/zscal.c                  |   6 +-
 kernel/power/zscal_microk_power10.c   |  37 +++-
 kernel/power/zswap.c                  |   4 +-
 param.h                               |   5 -
 27 files changed, 852 insertions(+), 131 deletions(-)
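
Notes (reviewer commentary, not part of the commit message):

The C micro-kernels all use the same compile-time byte-order dispatch. A
minimal sketch of the pattern, with the mask values taken from
caxpy_microk_power10.c, is:

    #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
      /* big endian (AIX): swap the real/imag words within each complex pair */
      __vector unsigned char mask = { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
    #else
      /* little endian: byte-reversed form of the same permutation */
      __vector unsigned char mask = { 11,10,9,8, 15,14,13,12, 3,2,1,0, 7,6,5,4 };
    #endif

In the GEMM assembly, the 64-bit permute constants are built without the
@highest/@higher/@h/@l relocation operators: lis/ori load the high 32 bits,
rldicr shifts them into the upper doubleword, and oris/ori fill in the low
32 bits of the .set constant.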

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 873653f1..50866c97 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -1,6 +1,3 @@
-ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
-include $(KERNELDIR)/KERNEL.POWER8
-else
 
 #SGEMM_BETA = ../generic/gemm_beta.c
 #DGEMM_BETA = ../generic/gemm_beta.c
@@ -44,6 +41,7 @@ DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 CGEMMKERNEL    = cgemm_kernel_power10.S
+#CGEMMKERNEL     = cgemm_kernel_8x4_power8.S
 CGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
 CGEMMITCOPY    = ../generic/zgemm_tcopy_8.c
 CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
@@ -219,4 +217,3 @@ QCABS_KERNEL	= ../generic/cabs.c
 CGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
 ZGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
 
-endif
diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c
index 56a5ab47..902eba82 100644
--- a/kernel/power/caxpy_microk_power10.c
+++ b/kernel/power/caxpy_microk_power10.c
@@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
 #endif
   const float *mvecp = mvec;
   /* We have to load reverse mask for big endian.  */
-  /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+  __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
+#else
   __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
+#endif
+
   long ytmp;
 
   __asm__
@@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
        "xvmaddasp	38, 58, 33	\n\t"
        "xvmaddasp	39, 59, 33	\n\t"
 
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "stxv        48, 0(%4)   \n\t"
+       "stxv        49, 16(%4)  \n\t"
+       "stxv        50, 32(%4)  \n\t"
+       "stxv        51, 48(%4)  \n\t"
+       "stxv        34, 64(%4)  \n\t"
+       "stxv        35, 80(%4)  \n\t"
+       "stxv        38, 96(%4)  \n\t"
+       "stxv        39, 112(%4) \n\t"
+#else
        "stxv		49, 0(%4)	\n\t"
        "stxv		48, 16(%4)	\n\t"
        "stxv		51, 32(%4)	\n\t"
@@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
        "stxv		34, 80(%4)	\n\t"
        "stxv		39, 96(%4)	\n\t"
        "stxv		38, 112(%4)	\n\t"
+#endif
 
        "addi		%4, %4, 128	\n\t"
        "xxperm 52, 40, %x10 \n\t"       // exchange real and imag part
@@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
        "xvmaddasp	38, 58, 33	\n\t"
        "xvmaddasp	39, 59, 33	\n\t"
 
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "stxv        48, 0(%4)   \n\t"
+       "stxv        49, 16(%4)  \n\t"
+       "stxv        50, 32(%4)  \n\t"
+       "stxv        51, 48(%4)  \n\t"
+       "stxv        34, 64(%4)  \n\t"
+       "stxv        35, 80(%4)  \n\t"
+       "stxv        38, 96(%4)  \n\t"
+       "stxv        39, 112(%4) \n\t"
+#else
        "stxv		49, 0(%4)	\n\t"
        "stxv		48, 16(%4)	\n\t"
        "stxv		51, 32(%4)	\n\t"
@@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
        "stxv		34, 80(%4)	\n\t"
        "stxv		39, 96(%4)	\n\t"
        "stxv		38, 112(%4)	\n\t"
+#endif
 
      "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
      :
diff --git a/kernel/power/ccopy_microk_power10.c b/kernel/power/ccopy_microk_power10.c
index 6c80f9cd..f30e1fa0 100644
--- a/kernel/power/ccopy_microk_power10.c
+++ b/kernel/power/ccopy_microk_power10.c
@@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
 
        ".align	5		\n"
      "one%=:				\n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "stxv        32, 0(%3)   \n\t"
+       "stxv        33, 16(%3)  \n\t"
+       "stxv        34, 32(%3)  \n\t"
+       "stxv        35, 48(%3)  \n\t"
+       "stxv        36, 64(%3)  \n\t"
+       "stxv        37, 80(%3)  \n\t"
+       "stxv        38, 96(%3)  \n\t"
+       "stxv        39, 112(%3) \n\t"
+#else
        "stxv		33, 0(%3)	\n\t"
        "stxv		32, 16(%3)	\n\t"
        "stxv		35, 32(%3)	\n\t"
@@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
        "stxv		36, 80(%3)	\n\t"
        "stxv		39, 96(%3)	\n\t"
        "stxv		38, 112(%3)	\n\t"
+#endif
        "lxvp		32, 0(%2)	\n\t"
        "lxvp		34, 32(%2)	\n\t"
        "lxvp		36, 64(%2)	\n\t"
        "lxvp		38, 96(%2)	\n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "stxv        40, 128(%3) \n\t"
+       "stxv        41, 144(%3) \n\t"
+       "stxv        42, 160(%3) \n\t"
+       "stxv        43, 176(%3) \n\t"
+       "stxv        44, 192(%3) \n\t"
+       "stxv        45, 208(%3) \n\t"
+       "stxv        46, 224(%3) \n\t"
+       "stxv        47, 240(%3) \n\t"
+#else
        "stxv		41, 128(%3)	\n\t"
        "stxv		40, 144(%3)	\n\t"
        "stxv		43, 160(%3)	\n\t"
@@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
        "stxv		44, 208(%3)	\n\t"
        "stxv		47, 224(%3)	\n\t"
        "stxv		46, 240(%3)	\n\t"
+#endif
        "lxvp		40, 128(%2)	\n\t"
        "lxvp		42, 160(%2)	\n\t"
        "lxvp		44, 192(%2)	\n\t"
@@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
        "bgt		one%=		\n"
 
      "two%=:				\n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "stxv        32, 0(%3)   \n\t"
+       "stxv        33, 16(%3)  \n\t"
+       "stxv        34, 32(%3)  \n\t"
+       "stxv        35, 48(%3)  \n\t"
+       "stxv        36, 64(%3)  \n\t"
+       "stxv        37, 80(%3)  \n\t"
+       "stxv        38, 96(%3)  \n\t"
+       "stxv        39, 112(%3) \n\t"
+       "stxv        40, 128(%3) \n\t"
+       "stxv        41, 144(%3) \n\t"
+       "stxv        42, 160(%3) \n\t"
+       "stxv        43, 176(%3) \n\t"
+       "stxv        44, 192(%3) \n\t"
+       "stxv        45, 208(%3) \n\t"
+       "stxv        46, 224(%3) \n\t"
+       "stxv        47, 240(%3) \n\t"
+#else
        "stxv		33, 0(%3)	\n\t"
        "stxv		32, 16(%3)	\n\t"
        "stxv		35, 32(%3)	\n\t"
@@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
        "stxv		44, 208(%3)	\n\t"
        "stxv		47, 224(%3)	\n\t"
        "stxv		46, 240(%3)	\n\t"
-
+#endif
      "#n=%1 x=%4=%2 y=%0=%3"
      :
        "=m" (*y),
diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c
index b9e2d2ce..c53fe0c0 100644
--- a/kernel/power/cdot.c
+++ b/kernel/power/cdot.c
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 
 #include "common.h"
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 #include "cdot_microk_power10.c"
 #else
 #ifndef HAVE_KERNEL_8
@@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
 
     if ((inc_x == 1) && (inc_y == 1)) {
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
         BLASLONG n1 = n & -16;
 #else
         BLASLONG n1 = n & -8;
diff --git a/kernel/power/cdot_microk_power10.c b/kernel/power/cdot_microk_power10.c
index 399f2b18..9d42559c 100644
--- a/kernel/power/cdot_microk_power10.c
+++ b/kernel/power/cdot_microk_power10.c
@@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
 {
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+  __vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
+#else
   __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
+#endif
   __asm__
     (
        "dcbt		0, %2		\n\t"
@@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
        "xxswapd		33, 34		\n\t"
        "xvaddsp		35, 35, 32	\n\t"
        "xvaddsp		34, 34, 33	\n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "xxpermdi 	34, 35, 34, 0 	\n\t"
+#else
        "xxpermdi	34, 34, 35, 2	\n\t"
+#endif
        "stxv		34, 0(%6)       \n\t"
 
      "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"
diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S
index e04f948d..fbd22aaa 100644
--- a/kernel/power/cgemm_kernel_power10.S
+++ b/kernel/power/cgemm_kernel_power10.S
@@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "cgemm_macros_power10.S"
 
+#if (_AIX)
+.set	perm_const1, 0x0405060700010203
+.set	perm_const2, 0x0c0d0e0f08090a0b
+.set	save_permute_12, 0x1011121300010203
+.set	save_permute_11, 0x18191a1b08090a0b
+#else
 .equ    perm_const1, 0x0405060700010203
 .equ    perm_const2, 0x0c0d0e0f08090a0b
 .equ save_permute_12, 0x0c0d0e0f1c1d1e1f
 .equ save_permute_11, 0x0405060714151617
-
+#endif
 
 
 #ifndef NEEDPARAM
@@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /*load reverse permute mask for big endian
   uint128 = 0xc0d0e0f08090a0b0405060700010203
 */ 
-		
+#if (_AIX)
+	lis	T2,	(perm_const2>>48 & 0xFFFF)
+	lis	T1,	(perm_const1>>48 & 0xFFFF)
+	lis	T3,	(save_permute_12>>48 & 0xFFFF)
+	lis	T4,	(save_permute_11>>48 & 0xFFFF)
+
+	ori	T2,	T2,	(perm_const2>>32 & 0xFFFF)
+	ori	T1,	T1,	(perm_const1>>32 & 0xFFFF)
+	ori	T3,	T3,	(save_permute_12>>32 & 0xFFFF)
+	ori	T4,	T4,	(save_permute_11>>32 & 0xFFFF)
+#else
 	lis T2, perm_const2@highest
 	lis T1, perm_const1@highest
 	lis T3, save_permute_12@highest
 	lis T4, save_permute_11@highest
-
 	
 	ori T2, T2, perm_const2@higher
 	ori T1, T1, perm_const1@higher
 	ori T3, T3, save_permute_12@higher
 	ori T4, T4, save_permute_11@higher
-
+#endif
 	
 	rldicr T2, T2, 32, 31
 	rldicr T1, T1, 32, 31
 	rldicr T3, T3, 32, 31
 	rldicr T4, T4, 32, 31 
 
+#if (_AIX)
+	oris	T2,	T2,	(perm_const2>>16 & 0xFFFF)
+	oris	T1,	T1,	(perm_const1>>16 & 0xFFFF)
+	oris	T3,	T3,	(save_permute_12>>16 & 0xFFFF)
+	oris	T4,	T4,	(save_permute_11>>16 & 0xFFFF)
+
+	ori	T2,	T2,	(perm_const2 & 0xFFFF)
+	ori	T1,	T1,	(perm_const1 & 0xFFFF)
+	ori	T3,	T3,	(save_permute_12 & 0xFFFF)
+	ori	T4,	T4,	(save_permute_11 & 0xFFFF)
+#else
 	oris T2, T2, perm_const2@h
 	oris T1, T1, perm_const1@h
 	oris T3, T3, save_permute_12@h
@@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ori T1, T1, perm_const1@l
 	ori T3, T3, save_permute_12@l  
 	ori T4, T4, save_permute_11@l
-
+#endif
 	
   li r0,0
   li PRE,512
diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S
index b66e9340..f75bf5da 100644
--- a/kernel/power/cgemm_macros_power10.S
+++ b/kernel/power/cgemm_macros_power10.S
@@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .if \OffsetA != 0
 	addi	\AREG, \AREG, \OffsetA
 .endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	3, 36, 34
+	xvf32gerpp	2, 37, 34
+	xvf32gerpp	1, 32, 34
+	xvf32gerpp	0, 33, 34
+	xvf32gerpp	7, 36, 35
+	xvf32gerpp	6, 37, 35
+	xvf32gerpp	5, 32, 35
+	xvf32gerpp	4, 33, 35
+#else
 	xvf32gerpp	3, 36, 35
 	xvf32gerpp	2, 37, 35
 	xvf32gerpp	1, 32, 35
@@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvf32gerpp	6, 37, 34
 	xvf32gerpp	5, 32, 34
 	xvf32gerpp	4, 33, 34
+#endif
 .endm
 
 .macro	LOAD4x8_2
@@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro	KERNEL4x8_2	AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if __BYTE_ORDER__ ==  __ORDER_BIG_ENDIAN__
+	xvf32gerpp	3, 36, 34
+	xvf32gerpp	2, 37, 34
+	xvf32gerpp	1, 32, 34
+	xvf32gerpp	0, 33, 34
+	xvf32gerpp	7, 36, 35
+	xvf32gerpp	6, 37, 35
+	xvf32gerpp	5, 32, 35
+	xvf32gerpp	4, 33, 35
+#else
 	xvf32gerpp	3, 36, 35
 	xvf32gerpp	2, 37, 35
 	xvf32gerpp	1, 32, 35
@@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvf32gerpp	6, 37, 34
 	xvf32gerpp	5, 32, 34
 	xvf32gerpp	4, 33, 34
+#endif
 .if \Complete==0
 	lxvp	vs34, DISP8(\Index, \OffsetB)(\BREG)
 	lxvp	vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
 	lxvp	vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
 .endif
+#if __BYTE_ORDER__ ==  __ORDER_BIG_ENDIAN__
+	xvf32gerpp	3, 42, 38
+	xvf32gerpp	2, 43, 38
+	xvf32gerpp	1, 40, 38
+	xvf32gerpp	0, 41, 38
+	xvf32gerpp	7, 42, 39
+	xvf32gerpp	6, 43, 39
+	xvf32gerpp	5, 40, 39
+	xvf32gerpp	4, 41, 39
+#else
 	xvf32gerpp	3, 42, 39
 	xvf32gerpp	2, 43, 39
 	xvf32gerpp	1, 40, 39
@@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvf32gerpp	6, 43, 38
 	xvf32gerpp	5, 40, 38
 	xvf32gerpp	4, 41, 38
+#endif
 .if \Complete==0
 	lxvp	vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
 	lxvp	vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
@@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	RECONSTRUCT_PAIR2
 #ifndef TRMMKERNEL
 	/* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs1, vs0, vs8, 1
+	xxpermdi	vs3, vs2, vs10, 1
+	xxpermdi	vs5, vs4, vs12, 1
+	xxpermdi	vs7, vs6, vs14, 1
+	xxpermdi	vs9, vs8, vs0, 1
+	xxpermdi	vs11, vs10, vs2, 1
+#else
 	xxpermdi	vs1, vs8, vs0, 2
 	xxpermdi	vs3, vs10, vs2, 2
 	xxpermdi	vs5, vs12, vs4, 2
 	xxpermdi	vs7, vs14, vs6, 2
 	xxpermdi	vs9, vs0, vs8, 2
 	xxpermdi	vs11, vs2, vs10, 2
+#endif
 	xvaddsp	vs24, vs24, vs3
 	xvaddsp	vs25, vs25, vs1
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs13, vs12, vs4, 1
+	xxpermdi	vs15, vs14, vs6, 1
+#else
 	xxpermdi	vs13, vs4, vs12, 2
 	xxpermdi	vs15, vs6, vs14, 2
+#endif
 	xvaddsp	vs26, vs26, vs7
 	xvaddsp	vs27, vs27, vs5
 	xvaddsp	vs28, vs28, vs11
 	xvaddsp	vs29, vs29, vs9
 	xvaddsp	vs30, vs30, vs15
 	xvaddsp	vs31, vs31, vs13
+#else
+#if __BYTE_ORDER__ ==  __ORDER_BIG_ENDIAN__
+	xxpermdi	vs25, vs0, vs8, 1
+	xxpermdi	vs24, vs2, vs10, 1
+	xxpermdi	vs27, vs4, vs12, 1
+	xxpermdi	vs26, vs6, vs14, 1
+	xxpermdi	vs29, vs8, vs0, 1
+	xxpermdi	vs28, vs10, vs2, 1
+	xxpermdi	vs31, vs12, vs4, 1
+	xxpermdi	vs30, vs14, vs6, 1
 #else
 	xxpermdi	vs25, vs8, vs0, 2
 	xxpermdi	vs24, vs10, vs2, 2
@@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxpermdi	vs28, vs2, vs10, 2
 	xxpermdi	vs31, vs4, vs12, 2
 	xxpermdi	vs30, vs6, vs14, 2
+#endif
 #endif
 	stxvp	vs24, 0(CO)
 	MULT_APLHA_PART1    vs48, vs56, vs0, vs1
@@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	RECONSTRUCT_PAIR2
 #ifndef TRMMKERNEL
   /* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs1, vs0, vs8, 1
+	xxpermdi	vs3, vs2, vs10, 1
+	xxpermdi	vs5, vs4, vs12, 1
+	xxpermdi	vs7, vs6, vs14, 1
+	xxpermdi	vs9, vs8, vs0, 1
+	xxpermdi	vs11, vs10, vs2, 1
+#else
 	xxpermdi	vs1, vs8, vs0, 2
 	xxpermdi	vs3, vs10, vs2, 2
 	xxpermdi	vs5, vs12, vs4, 2
 	xxpermdi	vs7, vs14, vs6, 2
 	xxpermdi	vs9, vs0, vs8, 2
 	xxpermdi	vs11, vs2, vs10, 2
+#endif
 	xvaddsp	vs32, vs32, vs3
 	xvaddsp	vs33, vs33, vs1
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs13, vs12, vs4, 1
+	xxpermdi	vs15, vs14, vs6, 1
+#else
 	xxpermdi	vs13, vs4, vs12, 2
 	xxpermdi	vs15, vs6, vs14, 2
+#endif
 	xvaddsp	vs40, vs40, vs7
 	xvaddsp vs41, vs41, vs5
 	xvaddsp	vs34, vs34, vs11
 	xvaddsp	vs35, vs35, vs9
 	xvaddsp	vs42, vs42, vs15
 	xvaddsp	vs43, vs43, vs13
+#else
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	xxpermdi	vs33, vs0, vs8, 1
+	xxpermdi	vs32, vs2, vs10, 1
+	xxpermdi	vs41, vs4, vs12, 1
+	xxpermdi	vs40, vs6, vs14, 1
+	xxpermdi	vs35, vs8, vs0, 1
+	xxpermdi	vs34, vs10, vs2, 1
+	xxpermdi	vs43, vs12, vs4, 1
+	xxpermdi	vs42, vs14, vs6, 1
 #else
 	xxpermdi	vs33, vs8, vs0, 2
 	xxpermdi	vs32, vs10, vs2, 2
@@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxpermdi	vs34, vs2, vs10, 2
 	xxpermdi	vs43, vs4, vs12, 2
 	xxpermdi	vs42, vs6, vs14, 2
+#endif
 #endif
 	stxvp	vs32, 0(T2)
 	stxvp	vs40, 32(T2)
@@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .if \OffsetA != 0
 	addi	\AREG, \AREG, \OffsetA
 .endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	3, 32, 35
+	xvf32gerpp	2, 33, 35
+	xvf32gerpp	1, 32, 34
+	xvf32gerpp	0, 33, 34
+#else
 	xvf32gerpp	3, 32, 34
 	xvf32gerpp	2, 33, 34
 	xvf32gerpp	1, 32, 35
 	xvf32gerpp	0, 33, 35
+#endif
 .endm
 
 .macro	LOAD4x4_2
@@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro	KERNEL4x4_2	AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	3, 32, 35
+	xvf32gerpp	2, 33, 35
+	xvf32gerpp	1, 32, 34
+	xvf32gerpp	0, 33, 34
+#else
 	xvf32gerpp	3, 32, 34
 	xvf32gerpp	2, 33, 34
 	xvf32gerpp	1, 32, 35
 	xvf32gerpp	0, 33, 35
+#endif
 .if \Complete==0
 	lxvp	vs34, DISP8(\Index, \OffsetB)(\BREG)
 	lxvp	vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
 .endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	3, 36, 39
+	xvf32gerpp	2, 37, 39
+	xvf32gerpp	1, 36, 38
+	xvf32gerpp	0, 37, 38
+#else
 	xvf32gerpp	3, 36, 38
 	xvf32gerpp	2, 37, 38
 	xvf32gerpp	1, 36, 39
 	xvf32gerpp	0, 37, 39
+#endif
 .if \Complete==0
 	lxvp	vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
 	lxvp	vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
@@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	RECONSTRUCT_PAIR2
 #ifndef TRMMKERNEL
   /* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs1, vs0, vs8, 1
+	xxpermdi	vs3, vs2, vs10, 1
+	xxpermdi	vs9, vs8, vs0, 1
+	xxpermdi	vs11, vs10, vs2, 1
+	xxpermdi	vs5, vs4, vs12, 1
+	xxpermdi	vs7, vs6, vs14, 1
+	xxpermdi	vs13, vs12, vs4, 1
+	xxpermdi	vs15, vs14, vs6, 1
+#else
 	xxpermdi	vs1, vs8, vs0, 2
 	xxpermdi	vs3, vs10, vs2, 2
 	xxpermdi	vs9, vs0, vs8, 2
@@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxpermdi	vs7, vs14, vs6, 2
 	xxpermdi	vs13, vs4, vs12, 2
 	xxpermdi	vs15, vs6, vs14, 2
+#endif
 	xvaddsp	vs24, vs24, vs3
 	xvaddsp	vs25, vs25, vs1
 	xvaddsp	vs26, vs26, vs11
@@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvaddsp	vs29, vs29, vs5
 	xvaddsp	vs30, vs30, vs15
 	xvaddsp	vs31, vs31, vs13
+#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs25, vs0, vs8, 1
+	xxpermdi	vs24, vs2, vs10, 1
+	xxpermdi	vs27, vs8, vs0, 1
+	xxpermdi	vs26, vs10, vs2, 1
+	xxpermdi	vs29, vs4, vs12, 1
+	xxpermdi	vs28, vs6, vs14, 1
+	xxpermdi	vs31, vs12, vs4, 1
+	xxpermdi	vs30, vs14, vs6, 1
 #else
 	xxpermdi	vs25, vs8, vs0, 2
 	xxpermdi	vs24, vs10, vs2, 2
@@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxpermdi	vs28, vs14, vs6, 2
 	xxpermdi	vs31, vs4, vs12, 2
 	xxpermdi	vs30, vs6, vs14, 2
+#endif
 #endif
 	stxvp	vs24, 0(CO)
 	stxvp	vs26, 0(T1)
@@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .if \OffsetA != 0
 	addi	\AREG, \AREG, \OffsetA
 .endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	1, 35, 32
+	xvf32gerpp	0, 34, 32
+#else
 	xvf32gerpp	1, 34, 32
 	xvf32gerpp	0, 35, 32
+#endif
 .endm
 
 .macro	LOAD4x2_2
@@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro	KERNEL4x2_2	AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	1, 35, 32
+	xvf32gerpp	0, 34, 32
+#else
 	xvf32gerpp	1, 34, 33
 	xvf32gerpp	0, 35, 33
+#endif
 .if \Complete==0
 	lxvp	vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
 .endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	1, 37, 33
+	xvf32gerpp	0, 36, 33
+#else
 	xvf32gerpp	1, 36, 32
 	xvf32gerpp	0, 37, 32
+#endif
 .if \Complete==0
 	lxvp	vs32, DISP4(\Index, \OffsetA)(\AREG)
 	lxvp	vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
@@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	RECONSTRUCT_PAIR1
 #ifndef TRMMKERNEL
   /* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs1, vs0, vs8, 0
+	xxpermdi	vs9, vs2, vs10, 0
+	xxpermdi	vs3, vs8, vs0, 3
+	xxpermdi	vs11, vs10, vs2, 3
+#else
 	xxpermdi	vs1, vs8, vs0, 0
 	xxpermdi	vs9, vs10, vs2, 0
 	xxpermdi	vs3, vs0, vs8, 3
 	xxpermdi	vs11, vs2, vs10, 3
+#endif
 	xvaddsp	vs24, vs24, vs1
 	xvaddsp	vs26, vs26, vs9
 	xvaddsp	vs25, vs25, vs3
 	xvaddsp	vs27, vs27, vs11
+#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs24, vs0, vs8, 0
+	xxpermdi	vs26, vs2, vs10, 0
+	xxpermdi	vs25, vs8, vs0, 3
+	xxpermdi	vs27, vs10, vs2, 3
 #else
 	xxpermdi	vs24, vs8, vs0, 0
 	xxpermdi	vs26, vs10, vs2, 0
 	xxpermdi	vs25, vs0, vs8, 3
 	xxpermdi	vs27, vs2, vs10, 3
+#endif
 #endif
 	stxv	vs24, 0(CO)
 	stxv	vs25, 0(T1)
@@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .if \OffsetA != 0
 	addi  \AREG, \AREG, \OffsetA
 .endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	0, 34, 32
+	xvf32gerpp	1, 35, 32
+#else
 	xvf32gerpp	    0, 35, 32
 	xvf32gerpp	    1, 34, 32
+#endif
 .endm
 
 .macro	LOAD4x1_2
@@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro	LOAD4x1_2O  OffsetA, OffsetB
 	lxv	vs32, (\OffsetA)(AO)
 	vspltisb        v6, 0
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs33, vs32, vs38, 2
+	xxpermdi	vs32, vs32, vs38, 0
+#else
 	xxpermdi        vs33, vs32, vs38, 0
 	xxpermdi        vs32, vs32, vs38, 2
+#endif
 	lxvp	vs34, (0+\OffsetB)(BO)
 	lxvp	vs36, (32+\OffsetB)(BO)
 .endm
@@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro	KERNEL4x1_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	0, 34, 32
+	xvf32gerpp	1, 35, 32
+#else
 	xvf32gerpp	    0, 35, 32
 	xvf32gerpp	    1, 34, 32
+#endif
 .if \Complete==0
 	lxvp	vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
 .endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	0, 36, 33
+	xvf32gerpp	1, 37, 33
+#else
 	xvf32gerpp	    0, 37, 33
 	xvf32gerpp	    1, 36, 33
+#endif
 .if \Complete==0
 	lxv	vs32, DISP2(\Index, \OffsetA)(\AREG)
 	lxvp	vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi        vs33, vs32, vs38, 2
+	xxpermdi        vs32, vs32, vs38, 0
+#else
 	xxpermdi        vs33, vs32, vs38, 0
 	xxpermdi        vs32, vs32, vs38, 2
+#endif
 .endif
 .if \IsLast==1
 .if \Complete==1
@@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro	KERNEL2x8_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	2, 37, 34
+	xvf32gerpp	3, 36, 34
+	xvf32gerpp	0, 33, 34
+	xvf32gerpp	1, 32, 34
+#else
 	xvf32gerpp	2, 37, 35
 	xvf32gerpp	3, 36, 35
 	xvf32gerpp	0, 33, 35
 	xvf32gerpp	1, 32, 35
+#endif
 
 .if \Complete==0
 	lxvp	vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
 	lxvp	vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
 .endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	2, 41, 35
+	xvf32gerpp	3, 40, 35
+	xvf32gerpp	0, 39, 35
+	xvf32gerpp	1, 38, 35
+#else
 	xvf32gerpp	2, 41, 34
 	xvf32gerpp	3, 40, 34
 	xvf32gerpp	0, 39, 34
 	xvf32gerpp	1, 38, 34
+#endif
 
 .if \Complete==0
 	lxvp	vs34, DISP4(\Index, \OffsetB)(\BREG)
@@ -1068,16 +1262,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	RECONSTRUCT_PAIR2
 #ifndef TRMMKERNEL
   /* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs1, vs0, vs8, 1
+	xxpermdi	vs3, vs2, vs10, 1
+	xxpermdi	vs5, vs4, vs12, 1
+	xxpermdi	vs7, vs6, vs14, 1
+	xxpermdi	vs9, vs8, vs0, 1
+	xxpermdi	vs11, vs10, vs2, 1
+#else
 	xxpermdi	vs1, vs8, vs0, 2
 	xxpermdi	vs3, vs10, vs2, 2
 	xxpermdi	vs5, vs12, vs4, 2
 	xxpermdi	vs7, vs14, vs6, 2
 	xxpermdi	vs9, vs0, vs8, 2
 	xxpermdi	vs11, vs2, vs10, 2
+#endif
 	xvaddsp	vs24, vs24, vs3
 	xvaddsp	vs25, vs25, vs1
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs13, vs12, vs4, 1
+	xxpermdi	vs15, vs14, vs6, 1
+#else
 	xxpermdi	vs13, vs4, vs12, 2
 	xxpermdi	vs15, vs6, vs14, 2
+#endif
 	xvaddsp	vs26, vs26, vs7
 	xvaddsp	vs27, vs27, vs5
 	xvaddsp	vs28, vs28, vs11
@@ -1085,6 +1293,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvaddsp	vs30, vs30, vs15
 	xvaddsp	vs31, vs31, vs13
 #else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs25, vs0, vs8, 1
+	xxpermdi	vs24, vs2, vs10, 1
+	xxpermdi	vs27, vs4, vs12, 1
+	xxpermdi	vs26, vs6, vs14, 1
+	xxpermdi	vs29, vs8, vs0, 1
+	xxpermdi	vs28, vs10, vs2, 1
+	xxpermdi	vs31, vs12, vs4, 1
+	xxpermdi	vs30, vs14, vs6, 1
+#else
 	xxpermdi	vs25, vs8, vs0, 2
 	xxpermdi	vs24, vs10, vs2, 2
 	xxpermdi	vs27, vs12, vs4, 2
@@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxpermdi	vs28, vs2, vs10, 2
 	xxpermdi	vs31, vs4, vs12, 2
 	xxpermdi	vs30, vs6, vs14, 2
+#endif
 #endif
 	stxvp	vs24, 0(CO)
 	stxvp	vs26, 32(CO)
@@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro	KERNEL2x4_2  AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	0, 33, 34
+	xvf32gerpp	1, 32, 34
+#else
 	xvf32gerpp	0, 33, 35
 	xvf32gerpp	1, 32, 35
+#endif
 .if \Complete==0
 	lxvp	vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
 .endif
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf32gerpp	0, 37, 35
+	xvf32gerpp	1, 36, 35
+#else
 	xvf32gerpp	0, 37, 34
 	xvf32gerpp	1, 36, 34
+#endif
+
 .if \Complete==0
 	lxvp	vs34, DISP4(\Index, \OffsetB)(\BREG)
 	lxvp	vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
@@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	RECONSTRUCT_PAIR1
 #ifndef TRMMKERNEL
   /* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs1, vs0, vs8, 1
+	xxpermdi	vs3, vs2, vs10, 1
+	xxpermdi	vs9, vs8, vs0, 1
+	xxpermdi	vs11, vs10, vs2, 1
+#else
 	xxpermdi	vs1, vs8, vs0, 2
 	xxpermdi	vs3, vs10, vs2, 2
 	xxpermdi	vs9, vs0, vs8, 2
 	xxpermdi	vs11, vs2, vs10, 2
+#endif
 	xvaddsp	vs24, vs24, vs3
 	xvaddsp	vs25, vs25, vs1
 	xvaddsp	vs26, vs26, vs11
 	xvaddsp	vs27, vs27, vs9
+#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs25, vs0, vs8, 1
+	xxpermdi	vs24, vs2, vs10, 1
+	xxpermdi	vs27, vs8, vs0, 1
+	xxpermdi	vs26, vs10, vs2, 1
 #else
 	xxpermdi	vs25, vs8, vs0, 2
 	xxpermdi	vs24, vs10, vs2, 2
 	xxpermdi	vs27, vs0, vs8, 2
 	xxpermdi	vs26, vs2, vs10, 2
+#endif
 #endif
 	stxvp	vs24, 0(CO)
 	stxvp	vs26, 0(T1)
@@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxperm	vs8, vs9, save_permute_1
 #ifndef TRMMKERNEL
   /* add */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs1, vs0, vs8, 0
+	xxpermdi	vs9, vs8, vs0, 3
+#else
 	xxpermdi	vs1, vs8, vs0, 0
 	xxpermdi	vs9, vs0, vs8, 3
+#endif
 	xvaddsp	vs24, vs24, vs1
 	xvaddsp	vs26, vs26, vs9
+#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs24, vs0, vs8, 0
+	xxpermdi	vs26, vs8, vs0, 3
 #else
 	xxpermdi	vs24, vs8, vs0, 0
 	xxpermdi	vs26, vs0, vs8, 3
+#endif
 #endif
 	stxv	vs24, 0(CO)
 	stxv	vs26, 0(T1)
@@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvp	vs32, (0+\OffsetA)(AO)
 	lxvp	vs36, (32+\OffsetA)(AO)
 	vspltisb        v10, 0
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs35, vs34, vs42, 2
+	xxpermdi	vs34, vs34, vs42, 0
+#else
 	xxpermdi        vs35, vs34, vs42, 0
 	xxpermdi        vs34, vs34, vs42, 2
+#endif
 	lxvp	vs38, (64+\OffsetA)(AO)
 	lxvp	vs40, (64+32+\OffsetA)(AO)
 .endm
@@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvf32gerpp	    3, 35, 40
 .if \Complete==0
 	lxv	vs34, DISP2(\Index, \OffsetB)(\BREG)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs35, vs34, vs42, 2
+	xxpermdi	vs34, vs34, vs42, 0
+#else
 	xxpermdi        vs35, vs34, vs42, 0
 	xxpermdi        vs34, vs34, vs42, 2
+#endif
 	lxvp	vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
 .endif
 .if \IsLast==1
@@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	MULT_APLHA_PART2    vs34, vs42, vs4, vs5
 	MULT_APLHA_PART2    vs35, vs43, vs6, vs7
 /* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxperm	vs0, vs1, save_permute_1
+	xxperm	vs2, vs3, save_permute_1
+	xxperm	vs4, vs5, save_permute_1
+	xxperm	vs6, vs7, save_permute_1
+#else
 	xxperm	vs0, vs1, vs28
 	xxperm	vs2, vs3, vs28
 	xxperm	vs4, vs5, vs28
 	xxperm	vs6, vs7, vs28
+#endif
 #ifndef TRMMKERNEL
   /* add */
 	xvaddsp	vs24, vs24, vs2
@@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvp	vs26, 32(CO)
 #else
 /* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxv	vs2, 0(CO)
+	stxv	vs0, 16(CO)
+	stxv	vs6, 32(CO)
+	stxv	vs4, 48(CO)
+#else
 	stxv	vs0, 0(CO)
 	stxv	vs2, 16(CO)
 	stxv	vs4, 32(CO)
 	stxv	vs6, 48(CO)
+#endif
 #endif
 	addi  CO, CO, 64
 .endm
@@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxv	vs34, (\OffsetB)(BO)
 	lxvp	vs32, (0+\OffsetA)(AO)
 	vspltisb        v6, 0
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs35, vs34, vs38, 2
+	xxpermdi	vs34, vs34, vs38, 0
+#else
 	xxpermdi        vs35, vs34, vs38, 0
 	xxpermdi        vs34, vs34, vs38, 2
+#endif
 	lxvp	vs36, (32+\OffsetA)(AO)
 .endm
 
@@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvf32gerpp	    1, 35, 36
 .if \Complete==0
 	lxv	vs34, DISP2(\Index, \OffsetB)(\BREG)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxpermdi	vs35, vs34, vs38, 2
+	xxpermdi	vs34, vs34, vs38, 0
+#else
 	xxpermdi        vs35, vs34, vs38, 0
 	xxpermdi        vs34, vs34, vs38, 2
+#endif
 	lxvp	vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
 .endif
 .if \IsLast==1
@@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	MULT_APLHA_PART2    vs32, vs40, vs0, vs1
 	MULT_APLHA_PART2    vs33, vs41, vs2, vs3
 /* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxperm	vs0, vs1, save_permute_1
+	xxperm	vs2, vs3, save_permute_1
+#else
 	xxperm	vs0, vs1, vs28
 	xxperm	vs2, vs3, vs28
+#endif
 #ifndef TRMMKERNEL
   /* add */
 	xvaddsp	vs24, vs24, vs2
@@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvp	vs24, 0(CO)
 #else
 /* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxv	vs2, 0(CO)
+	stxv	vs0, 16(CO)
+#else
 	stxv	vs0, 0(CO)
 	stxv	vs2, 16(CO)
+#endif
 #endif
 	addi  CO, CO, 32
 .endm
@@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	MULT_APLHA_PART1    vs32, vs40, vs0, vs1
 	MULT_APLHA_PART2    vs32, vs40, vs0, vs1
 /* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxperm	vs0, vs1, save_permute_1
+#else
 	xxperm	vs0, vs1, vs28
+#endif
 #ifndef TRMMKERNEL
   /* add */
 	xvaddsp	vs24, vs24, vs0
@@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	MULT_APLHA_PART1    vs32, vs40, vs37, vs1
 	MULT_APLHA_PART2    vs32, vs40, vs37, vs1
 /* reconstruct r, i pairs*/
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxperm	vs37, vs1, save_permute_1
+#else
 	xxperm	vs37, vs1, vs28
+#endif
 #ifndef TRMMKERNEL
   /* add */
 	xvaddsp	vs36, vs36, vs37
diff --git a/kernel/power/cscal_microk_power10.c b/kernel/power/cscal_microk_power10.c
index 70b50809..d6a91f07 100644
--- a/kernel/power/cscal_microk_power10.c
+++ b/kernel/power/cscal_microk_power10.c
@@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
 {
   __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+  __vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
+#else
   __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
+#endif
   __asm__
     (
        "dcbt		0, %2		\n\t"
diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c
index c2fde1c4..4d9b9ccd 100644
--- a/kernel/power/cswap.c
+++ b/kernel/power/cswap.c
@@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8)  || defined(POWER9)
 #include "cswap_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "cswap_microk_power10.c"
 #elif defined(POWER10)
-#include "cswap_microk_power8.c"
+#include "cswap_microk_power10.c"
 #endif
 #endif
 
diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c
index 35390dd2..9ed0af76 100644
--- a/kernel/power/dasum.c
+++ b/kernel/power/dasum.c
@@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "dasum_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "dasum_microk_power10.c"
 #elif defined(POWER10)
-#include "dasum_microk_power8.c"
+#include "dasum_microk_power10.c"
 #endif
 #endif
 
-
 #ifndef HAVE_KERNEL_16
 
 static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
@@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	if ( inc_x == 1 )
 	{
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 		if ( n >= 32)
 		{
 			BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c
index e47de2cb..65743731 100644
--- a/kernel/power/dgemv_n_microk_power10.c
+++ b/kernel/power/dgemv_n_microk_power10.c
@@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
        XXSPLTD_S(32,%x9,0)	// alpha, alpha
 
        "sldi		%6, %13, 3	\n\t"	// lda * sizeof (double)
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "xvmuldp     34, 40, 32  \n\t"   // x0 * alpha, x1 * alpha
+       "xvmuldp     35, 41, 32  \n\t"	// x2 * alpha, x3 * alpha
+#else
        "xvmuldp		34, 41, 32	\n\t"	// x0 * alpha, x1 * alpha
        "xvmuldp		35, 40, 32	\n\t"	// x2 * alpha, x3 * alpha
+#endif
 
        "add		%4, %3, %6	\n\t"	// a0 = ap, a1 = a0 + lda
        "add		%6, %6, %6	\n\t"	// 2 * lda
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       XXSPLTD_S(32,34,0)   // x0 * alpha, x0 * alpha
+       XXSPLTD_S(33,34,1)   // x1 * alpha, x1 * alpha
+       XXSPLTD_S(34,35,0)   // x2 * alpha, x2 * alpha
+       XXSPLTD_S(35,35,1)   // x3 * alpha, x3 * alpha
+#else
        XXSPLTD_S(32,34,1)	// x0 * alpha, x0 * alpha
        XXSPLTD_S(33,34,0)	// x1 * alpha, x1 * alpha
        XXSPLTD_S(34,35,1)	// x2 * alpha, x2 * alpha
        XXSPLTD_S(35,35,0)	// x3 * alpha, x3 * alpha
-
+#endif
        "add		%5, %3, %6	\n\t"	// a2 = a0 + 2 * lda
        "add		%6, %4, %6	\n\t"	// a3 = a1 + 2 * lda
 
@@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
 
        "add		%4, %3, %10	\n\t"	// a0 = ap, a1 = a0 + lda
        "add		%10, %10, %10	\n\t"	// 2 * lda
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       XXSPLTD_S(32,34,0)       // x0 * alpha, x0 * alpha
+       XXSPLTD_S(33,34,1)       // x1 * alpha, x1 * alpha
+       XXSPLTD_S(34,35,0)       // x2 * alpha, x2 * alpha
+       XXSPLTD_S(35,35,1)       // x3 * alpha, x3 * alpha
+       XXSPLTD_S(48,39,0)       // x6 * alpha, x6 * alpha
+       XXSPLTD_S(49,39,1)       // x7 * alpha, x7 * alpha
+       XXSPLTD_S(39,38,1)       // x5 * alpha, x5 * alpha
+       XXSPLTD_S(38,38,0)       // x4 * alpha, x4 * alpha
+#else
        XXSPLTD_S(32,34,1)       // x0 * alpha, x0 * alpha
        XXSPLTD_S(33,34,0)       // x1 * alpha, x1 * alpha
        XXSPLTD_S(34,35,1)       // x2 * alpha, x2 * alpha
@@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
        XXSPLTD_S(49,39,0)       // x7 * alpha, x7 * alpha
        XXSPLTD_S(39,38,0)       // x5 * alpha, x5 * alpha
        XXSPLTD_S(38,38,1)       // x4 * alpha, x4 * alpha
+#endif
 
        "add		%5, %3, %10	\n\t"	// a2 = a0 + 2 * lda
        "add		%6, %4, %10	\n\t"	// a3 = a1 + 2 * lda
@@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
      "one%=:				\n\t"
 
        "lxvp		36, 0( %2)	\n\t"	// y0, y1
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "xvmaddadp       36, 40, 32      \n\t"
+       "xvmaddadp       37, 41, 32      \n\t"
+#else
        "xvmaddadp       36, 40, 34      \n\t"
        "xvmaddadp       37, 41, 34      \n\t"
+#endif
        "lxvpx		40, %3, %11	\n\t"	// a0[0], a0[1]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "xvmaddadp       36, 42, 33      \n\t"
+       "xvmaddadp       37, 43, 33      \n\t"
+#else
        "xvmaddadp       36, 42, 35      \n\t"
        "xvmaddadp       37, 43, 35      \n\t"
+#endif
        "lxvpx		42, %4, %11	\n\t"	// a1[0], a1[1]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "xvmaddadp       36, 44, 34      \n\t"
+       "xvmaddadp       37, 45, 34      \n\t"
+#else
        "xvmaddadp       36, 44, 32      \n\t"
        "xvmaddadp       37, 45, 32      \n\t"
+#endif
        "lxvpx		44, %5, %11	\n\t"	// a2[0], a2[1]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "xvmaddadp       36, 46, 35      \n\t"
+       "xvmaddadp       37, 47, 35      \n\t"
+#else
        "xvmaddadp       36, 46, 33      \n\t"
        "xvmaddadp       37, 47, 33      \n\t"
+#endif
        "lxvpx		46, %6, %11	\n\t"	// a3[0], a3[1]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "xvmaddadp       36, 50, 38      \n\t"
+       "xvmaddadp       37, 51, 38      \n\t"
+#else
        "xvmaddadp       36, 50, 48      \n\t"
        "xvmaddadp       37, 51, 48      \n\t"
+#endif
        "lxvpx		50, %7, %11	\n\t"	// a4[0]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "xvmaddadp       36, 52, 39      \n\t"
+       "xvmaddadp       37, 53, 39      \n\t"
+#else
        "xvmaddadp       36, 52, 49      \n\t"
        "xvmaddadp       37, 53, 49      \n\t"
+#endif
        "lxvpx		52, %8, %11	\n\t"	// a5[0]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "xvmaddadp       36, 54, 48      \n\t"
+       "xvmaddadp       37, 55, 48      \n\t"
+#else
        "xvmaddadp       36, 54, 38      \n\t"
        "xvmaddadp       37, 55, 38      \n\t"
+#endif
        "lxvpx		54, %9, %11	\n\t"	// a6[0]
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "xvmaddadp       36, 56, 49      \n\t"
+       "xvmaddadp       37, 57, 49      \n\t"
+#else
        "xvmaddadp       36, 56, 39      \n\t"
        "xvmaddadp       37, 57, 39      \n\t"
+#endif
        "lxvpx		56, %10, %11	\n\t"	// a7[0]
        "addi		%11, %11, 32    \n\t"
 
@@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
      "two%=:				\n\t"
 
        "lxvp		36, 0( %2)	\n\t"	// y0, y1
+#if  (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "xvmaddadp       36, 40, 32      \n\t"
+       "xvmaddadp       37, 41, 32      \n\t"
+       "xvmaddadp       36, 42, 33      \n\t"
+       "xvmaddadp       37, 43, 33      \n\t"
+       "xvmaddadp       36, 44, 34      \n\t"
+       "xvmaddadp       37, 45, 34      \n\t"
+       "xvmaddadp       36, 46, 35      \n\t"
+       "xvmaddadp       37, 47, 35      \n\t"
+       "xvmaddadp       36, 50, 38      \n\t"
+       "xvmaddadp       37, 51, 38      \n\t"
+       "xvmaddadp       36, 52, 39      \n\t"
+       "xvmaddadp       37, 53, 39      \n\t"
+       "xvmaddadp       36, 54, 48      \n\t"
+       "xvmaddadp       37, 55, 48      \n\t"
+       "xvmaddadp       36, 56, 49      \n\t"
+       "xvmaddadp       37, 57, 49      \n\t"
+#else
        "xvmaddadp       36, 40, 34      \n\t"
        "xvmaddadp       37, 41, 34      \n\t"
        "xvmaddadp       36, 42, 35      \n\t"
@@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
        "xvmaddadp       37, 55, 38      \n\t"
        "xvmaddadp       36, 56, 39      \n\t"
        "xvmaddadp       37, 57, 39      \n\t"
+#endif
        "stxvp		36, 0( %2)	\n\t"	// y0, y1
 
      :
diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c
index 3db4d578..899b2a04 100644
--- a/kernel/power/dgemv_t_power10.c
+++ b/kernel/power/dgemv_t_power10.c
@@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
             "lxvp 40, 32(%[y]) \n\t"
 
  
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+            XXMRGHD_S(42,34,35)
+            XXMRGLD_S(43,34,35)
 
+            XXMRGHD_S(44,4,5)
+            XXMRGLD_S(45,4,5)
+#else
             XXMRGLD_S(42,35,34)
             XXMRGHD_S(43,35,34)
 
             XXMRGLD_S(44,5,4)
             XXMRGHD_S(45,5,4)
+#endif
 
             "xvadddp 42,42,43 \n\t"
 
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+            XXMRGHD_S(46,6,7)
+            XXMRGLD_S(47,6,7)
+#else
             XXMRGLD_S(46,7,6)
             XXMRGHD_S(47,7,6)
-
+#endif
             "xvadddp 44,44,45 \n\t"
 
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+            XXMRGHD_S(48,8,9)
+            XXMRGLD_S(49,8,9)
+#else
             XXMRGLD_S(48,9,8)
             XXMRGHD_S(49,9,8)
-
+#endif
             "xvadddp 46,46,47 \n\t"
-            
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+            "xvmaddadp  38,42,36  \n\t"
+            "xvmaddadp  39,44,36  \n\t"
+#else
             "xvmaddadp  39,42,36  \n\t"
             "xvmaddadp  38,44,36  \n\t"
-            
+#endif
             "xvadddp 48,48,49 \n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+            "xvmaddadp  41,48,36  \n\t"
+#else
             "xvmaddadp  41,46,36  \n\t"
-
+#endif
             "stxvp 38, 0(%[y]) \n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+            "xvmaddadp  40,46,36  \n\t"
+#else
             "xvmaddadp  40,48,36  \n\t" 
+#endif
             "stxvp 40, 32(%[y])  \n\t"
                  
             : [memy] "+m" (*(double (*)[8])y),
diff --git a/kernel/power/drot.c b/kernel/power/drot.c
index 30c7411c..2aa0b805 100644
--- a/kernel/power/drot.c
+++ b/kernel/power/drot.c
@@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "drot_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "drot_microk_power10.c"
 #elif defined(POWER10)
-#include "drot_microk_power8.c"
+#include "drot_microk_power10.c"
 #endif
 #endif
 
@@ -117,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
 	if ( (inc_x == 1) && (inc_y == 1) )
 	{
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 		if ( n >= 16 )
 		{
 			BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c
index 32c39a8f..96c4e51b 100644
--- a/kernel/power/dscal.c
+++ b/kernel/power/dscal.c
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "dscal_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "dscal_microk_power10.c"
 #elif defined(POWER10)
-#include "dscal_microk_power8.c"
+#include "dscal_microk_power10.c"
 #endif
 #endif
 
@@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 		if ( da == 0.0 )
 		{		
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 			if ( n >= 16 )
 			{
 				BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
@@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 		else
 		{
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 			if ( n >= 16 )
 			{
 				BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c
index 12476965..9e6229c6 100644
--- a/kernel/power/dswap.c
+++ b/kernel/power/dswap.c
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "dswap_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "swap_microk_power10.c"
 #elif defined(POWER10)
-#include "dswap_microk_power8.c"
+#include "swap_microk_power10.c"
 #endif
 #endif
 
@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
 	if ( (inc_x == 1) && (inc_y == 1 ))
 	{
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 		if ( n >= 32 )
 		{
 			BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c
index 991d2750..af692a7f 100644
--- a/kernel/power/sasum.c
+++ b/kernel/power/sasum.c
@@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "sasum_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "sasum_microk_power10.c"
 #elif defined(POWER10)
-#include "sasum_microk_power8.c"
+#include "sasum_microk_power10.c"
 #endif
 #endif
 
@@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	if ( inc_x == 1 )
 	{
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 		if ( n >= 32 )
 		{
 			BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
diff --git a/kernel/power/srot.c b/kernel/power/srot.c
index 5a0d4b12..3e4f93e2 100644
--- a/kernel/power/srot.c
+++ b/kernel/power/srot.c
@@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "srot_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "srot_microk_power10.c"
 #elif defined(POWER10)
-#include "srot_microk_power8.c"
+#include "srot_microk_power10.c"
 #endif
 #endif
 
@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
 	if ( (inc_x == 1) && (inc_y == 1) )
 	{
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 		if ( n >= 16 )
 		{
 			BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c
index 9ae9ccab..65572a8c 100644
--- a/kernel/power/sscal.c
+++ b/kernel/power/sscal.c
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "sscal_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "sscal_microk_power10.c"
 #elif defined(POWER10)
-#include "sscal_microk_power8.c"
+#include "sscal_microk_power10.c"
 #endif
 #endif
 
@@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 		if ( da == 0.0 )
 		{		
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 			if ( n >= 32 )
 			{
 				BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
@@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 		else
 		{
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 			if ( n >= 32 )
 			{
 				BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c
index 955ed02f..dd249fd3 100644
--- a/kernel/power/sswap.c
+++ b/kernel/power/sswap.c
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "sswap_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
-#include "swap_microk_power10.c"
 #elif defined(POWER10)
-#include "sswap_microk_power8.c"
+#include "swap_microk_power10.c"
 #endif
 #endif
 
@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
 	if ( (inc_x == 1) && (inc_y == 1 ))
 	{
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 		if ( n >= 64 )
 		{
 			BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
diff --git a/kernel/power/zaxpy_microk_power10.c b/kernel/power/zaxpy_microk_power10.c
index 8e593bbf..b03508b0 100644
--- a/kernel/power/zaxpy_microk_power10.c
+++ b/kernel/power/zaxpy_microk_power10.c
@@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
 			    double alpha_r, double alpha_i)
 {
 #if !defined(CONJ)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+  static const double mvec[2] = { -1.0, 1.0 };
+#else
+  static const double mvec[2] = { 1.0, -1.0 };
+#endif
+#else
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
   static const double mvec[2] = { 1.0, -1.0 };
 #else
   static const double mvec[2] = { -1.0, 1.0 };
+#endif
 #endif
   const double *mvecp = mvec;
 
diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S
index fca389e6..afee8f18 100644
--- a/kernel/power/zgemm_kernel_power10.S
+++ b/kernel/power/zgemm_kernel_power10.S
@@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     std    r0, FLINK_SAVE(SP)
  
 
-#if defined(linux) || defined(__FreeBSD__)
+#if defined(linux) || defined(__FreeBSD__) || defined(_AIX)
 	ld	LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
 #endif
 
 
 #ifdef TRMMKERNEL
-#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
+#if (defined(linux) || defined(__FreeBSD__) || defined(_AIX)) && defined(__64BIT__)
 	ld	OFFSET,  FRAMESLOT(1) + 0(FRAMEPOINTER)
 #endif 
 #endif
diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S
index 42f9c5ad..e5e5ec0e 100644
--- a/kernel/power/zgemm_macros_power10.S
+++ b/kernel/power/zgemm_macros_power10.S
@@ -41,23 +41,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef TRMMKERNEL 
   lxv	\VS_TEMP1,	DISPX(\LOFFSET)(\REG)
   lxv	\VS_TEMP2,	DISPX(\LOFFSET+16)(\REG)
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+  xxmrghd  \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+  xxmrgld  \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#else
   xxmrgld  \VS_OUT1,\VS_TEMP1,\VS_TEMP2
   xxmrghd  \VS_OUT2,\VS_TEMP1,\VS_TEMP2	
+#endif
 #endif	
 .endm
 /*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
 
 
 .macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxmrghd	\VSOUT1, \VSIN1,\VSIN2 /*  real*real from 2 results*/
+	xxmrgld	\VSOUT2, \VSIN1,\VSIN2 /*  imag*imag from 2 results*/
+#else
 	xxmrgld	\VSOUT1, \VSIN1,\VSIN2 /*  real*real from 2 results*/
 	xxmrghd	\VSOUT2, \VSIN1,\VSIN2 /*  imag*imag from 2 results*/
+#endif
 .endm 
 /*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/
 
 
 .macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxmrghd	\VSOUT1, \VSIN1,\VSIN2 /*  real*imag */
+	xxmrgld	\VSOUT2, \VSIN1,\VSIN2 /*  imag*real*/
+#else
 	xxmrgld	\VSOUT1, \VSIN1,\VSIN2 /*  real*imag */
 	xxmrghd	\VSOUT2, \VSIN1,\VSIN2 /*  imag*real*/
+#endif
 .endm
 /* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
 
@@ -103,8 +118,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 .macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxmrghd  \VSOUT1,\VSIN1,\VSIN2
+	xxmrgld  \VSOUT2,\VSIN1,\VSIN2
+#else
 	xxmrghd  \VSOUT1,\VSIN2,\VSIN1
 	xxmrgld  \VSOUT2,\VSIN2,\VSIN1
+#endif
 .endm
 
 
@@ -186,15 +206,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
 #ifndef TRMMKERNEL 
   lxv	vs50,	(\LOFFSET)(\BASE_REG) 
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+  xxmrghd  vs46,vs50,vs50
+  xxmrgld  vs47,vs50,vs50
+#else
   xxmrgld  vs46,vs50,vs50
   xxmrghd  vs47,vs50,vs50	
+#endif
 #endif	
   RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes2,vs36,vs37	
   AGGREGATE_REALS_IMAGES	vs34,vs35,vs36,vs37	
   MULT_APLHA_PART1	vs34,vs36, vs46,vs47	
   MULT_APLHA_PART2	vs34,vs36, vs46,vs47  
   UNPACK_FOR_STORE	vs46,vs47,vs39,vs41 
+#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
   xxmrghd  vs39,vs47,vs46	
+#endif
   stxv	vs39,	(\LOFFSET)(\BASE_REG) 
 .endm
 
@@ -232,6 +259,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvp	vs44,	DISP16(\Index,192)(AO)	// load real,imag from A
 	lxvp	vs46,	DISP16(\Index,224)(AO)	// load real,imag from A
  	lxvp	vs50,	DISP4(\Index,  32)(BO)	// load real,imag from B
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp	0,	vs32,	vs48
+	xvf64gerpp	1,	vs34,	vs48
+	xvf64gerpp	2,	vs36,	vs48
+	xvf64gerpp	3,	vs38,	vs48
+	xvf64gerpp	4,	vs32,	vs49
+	xvf64gerpp	5,	vs34,	vs49
+	xvf64gerpp	6,	vs36,	vs49
+	xvf64gerpp	7,	vs38,	vs49
+#else
 	xvf64gerpp	0,	vs32,	vs49
 	xvf64gerpp	1,	vs34,	vs49
 	xvf64gerpp	2,	vs36,	vs49
@@ -240,11 +277,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvf64gerpp	5,	vs34,	vs48
 	xvf64gerpp	6,	vs36,	vs48
 	xvf64gerpp	7,	vs38,	vs48
+#endif
 	lxvp	vs32,	DISP16(\Index, 256)(AO)	// load real,imag from A
 	lxvp	vs34,	DISP16(\Index, 288)(AO)	// load real,imag from A
 	lxvp	vs36,	DISP16(\Index, 320)(AO)	// load real,imag from A
 	lxvp	vs38,	DISP16(\Index, 352)(AO)	// load real,imag from A
 	lxvp	vs48,	DISP4(\Index,  64)(BO)	// load real imag from B
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp	0,	vs40,	vs50
+	xvf64gerpp	1,	vs42,	vs50
+	xvf64gerpp	2,	vs44,	vs50
+	xvf64gerpp	3,	vs46,	vs50
+	xvf64gerpp	4,	vs40,	vs51
+	xvf64gerpp	5,	vs42,	vs51
+	xvf64gerpp	6,	vs44,	vs51
+	xvf64gerpp	7,	vs46,	vs51
+#else
 	xvf64gerpp	0,	vs40,	vs51
 	xvf64gerpp	1,	vs42,	vs51
 	xvf64gerpp	2,	vs44,	vs51
@@ -253,6 +301,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvf64gerpp	5,	vs42,	vs50
 	xvf64gerpp	6,	vs44,	vs50
 	xvf64gerpp	7,	vs46,	vs50
+#endif
 .if \IsLast==1
 	addi	AO, AO,  DISP16(\Index,256)
 	addi	BO, BO,  DISP4(\Index,64)
@@ -261,6 +310,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 .macro LOAD_END_2x8  OffsetA,OffsetB
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp  0,  vs32,   vs48
+	xvf64gerpp  1,  vs34,   vs48
+	xvf64gerpp  2,  vs36,   vs48
+	xvf64gerpp  3,  vs38,   vs48
+	xvf64gerpp  4,  vs32,   vs49
+	xvf64gerpp  5,  vs34,   vs49
+	xvf64gerpp  6,  vs36,   vs49
+	xvf64gerpp  7,  vs38,   vs49
+#else
 	xvf64gerpp	0,	vs32,	vs49
 	xvf64gerpp	1,	vs34,	vs49
 	xvf64gerpp	2,	vs36,	vs49
@@ -269,6 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvf64gerpp	5,	vs34,	vs48
 	xvf64gerpp	6,	vs36,	vs48
 	xvf64gerpp	7,	vs38,	vs48
+#endif
 	addi	BO, BO, \OffsetB
 	addi	AO, AO, \OffsetA
 .endm
@@ -305,7 +365,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         xxpermdi vs45, vs12, vs13, 0b10
         xxpermdi vs46, vs14, vs15, 0b01
         xxpermdi vs47, vs14, vs15, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxlor vs0, vs32, vs32
+	xxlor vs1, vs33, vs33
+	xxlor vs2, vs34, vs34
+	xxlor vs3, vs35, vs35
+	xxlor vs4, vs36, vs36
+	xxlor vs5, vs37, vs37
+	xxlor vs6, vs38, vs38
+	xxlor vs7, vs39, vs39
+	xxlor vs8, vs40, vs40
+	xxlor vs9, vs41, vs41
+	xxlor vs10, vs42, vs42
+	xxlor vs11, vs43, vs43
+	xxlor vs12, vs44, vs44
+	xxlor vs13, vs45, vs45
+	xxlor vs14, vs46, vs46
+	xxlor vs15, vs47, vs47
+#else
 	xxlor vs2, vs32, vs32
 	xxlor vs3, vs33, vs33
 	xxlor vs0, vs34, vs34
@@ -322,7 +399,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxlor vs15, vs45, vs45
 	xxlor vs12, vs46, vs46
 	xxlor vs13, vs47, vs47
-
+#endif
         xxpermdi vs32, vs16, vs17, 0b01
         xxpermdi vs33, vs16, vs17, 0b10
         xxpermdi vs34, vs18, vs19, 0b01
@@ -339,7 +416,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         xxpermdi vs45, vs28, vs29, 0b10
         xxpermdi vs46, vs30, vs31, 0b01
         xxpermdi vs47, vs30, vs31, 0b10
-       
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxlor vs16, vs32, vs32
+	xxlor vs17, vs33, vs33
+	xxlor vs18, vs34, vs34
+	xxlor vs19, vs35, vs35
+	xxlor vs20, vs36, vs36
+	xxlor vs21, vs37, vs37
+	xxlor vs22, vs38, vs38
+	xxlor vs23, vs39, vs39
+	xxlor vs24, vs40, vs40
+	xxlor vs25, vs41, vs41
+	xxlor vs26, vs42, vs42
+	xxlor vs27, vs43, vs43
+	xxlor vs28, vs44, vs44
+	xxlor vs29, vs45, vs45
+	xxlor vs30, vs46, vs46
+	xxlor vs31, vs47, vs47
+#else
 	xxlor vs18, vs32, vs32
 	xxlor vs19, vs33, vs33
 	xxlor vs16, vs34, vs34
@@ -356,7 +450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxlor vs31, vs45, vs45
 	xxlor vs28, vs46, vs46
 	xxlor vs29, vs47, vs47
-
+#endif
 	SAVE8  vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
 	SAVE8  vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0  
 	addi	CO, CO, 128
@@ -388,17 +482,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvp	vs40,	DISP8(\Index,  64)(AO)	// load real,imag from A
 	lxvp	vs42,	DISP8(\Index,  96)(AO)	// load real,imag from A
  	lxvp	vs50,	DISP4(\Index,  32)(BO)  // load real,imag from B
-        xvf64gerpp      0,      vs32,   vs49
-        xvf64gerpp      1,      vs34,   vs49
-        xvf64gerpp      2,      vs32,   vs48
-        xvf64gerpp      3,      vs34,   vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp	0,	vs32,	vs48
+	xvf64gerpp	1,	vs34,	vs48
+	xvf64gerpp	2,	vs32,	vs49
+	xvf64gerpp	3,	vs34,	vs49
+#else
+	xvf64gerpp	0,	vs32,	vs49
+	xvf64gerpp	1,	vs34,	vs49
+	xvf64gerpp	2,	vs32,	vs48
+	xvf64gerpp	3,	vs34,	vs48
+#endif
 	lxvp	vs32,	DISP8(\Index, 128)(AO)	// load real,imag from A
 	lxvp	vs34,	DISP8(\Index, 160)(AO)	// load real,imag from A
  	lxvp	vs48,	DISP4(\Index,  64)(BO)  // load real,imag from B
-        xvf64gerpp      0,      vs40,   vs51 
-        xvf64gerpp      1,      vs42,   vs51
-        xvf64gerpp      2,      vs40,   vs50
-        xvf64gerpp      3,      vs42,   vs50
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp	0,	vs40,	vs50
+	xvf64gerpp	1,	vs42,	vs50
+	xvf64gerpp	2,	vs40,	vs51
+	xvf64gerpp	3,	vs42,	vs51
+#else
+	xvf64gerpp	0,	vs40,	vs51
+	xvf64gerpp	1,	vs42,	vs51
+	xvf64gerpp	2,	vs40,	vs50
+	xvf64gerpp	3,	vs42,	vs50
+#endif
 .if \IsLast==1
 	addi	AO, AO, DISP8(\Index,128)
 	addi	BO, BO, DISP4(\Index,64)
@@ -407,10 +515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  
 
 .macro LOAD_END_2x4	OffsetA, OffsetB
-        xvf64gerpp      0,      vs32,   vs49
-        xvf64gerpp      1,      vs34,   vs49
-        xvf64gerpp      2,      vs32,   vs48
-        xvf64gerpp      3,      vs34,   vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp      0,      vs32,   vs48
+	xvf64gerpp      1,      vs34,   vs48
+	xvf64gerpp      2,      vs32,   vs49
+	xvf64gerpp      3,      vs34,   vs49
+#else
+	xvf64gerpp      0,      vs32,   vs49
+	xvf64gerpp      1,      vs34,   vs49
+	xvf64gerpp      2,      vs32,   vs48
+	xvf64gerpp      3,      vs34,   vs48
+#endif
 	addi	BO, BO, \OffsetB
 	addi	AO, AO, \OffsetA
 .endm
@@ -443,7 +558,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         xxpermdi vs45, vs12, vs13, 0b10
         xxpermdi vs46, vs14, vs15, 0b01
         xxpermdi vs47, vs14, vs15, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxlor vs0, vs32, vs32
+	xxlor vs1, vs33, vs33
+	xxlor vs2, vs34, vs34
+	xxlor vs3, vs35, vs35
+	xxlor vs4, vs36, vs36
+	xxlor vs5, vs37, vs37
+	xxlor vs6, vs38, vs38
+	xxlor vs7, vs39, vs39
+	xxlor vs8, vs40, vs40
+	xxlor vs9, vs41, vs41
+	xxlor vs10, vs42, vs42
+	xxlor vs11, vs43, vs43
+	xxlor vs12, vs44, vs44
+	xxlor vs13, vs45, vs45
+	xxlor vs14, vs46, vs46
+	xxlor vs15, vs47, vs47
+#else
 	xxlor vs2, vs32, vs32
 	xxlor vs3, vs33, vs33
 	xxlor vs0, vs34, vs34
@@ -460,7 +592,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxlor vs15, vs45, vs45
 	xxlor vs12, vs46, vs46
 	xxlor vs13, vs47, vs47
-
+#endif
 	SAVE4  vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
 	SAVE4  vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0  
 	addi	CO, CO, 64
@@ -488,12 +620,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL2x2_2 Index, IsLast
 	lxvp	vs40,	DISP4(\Index, 32)(AO)	// load real,imag from A
  	lxvp	vs50,	DISP4(\Index, 32)(BO)	// load real,imag from B
-        xvf64gerpp      0,      vs32,   vs49
-        xvf64gerpp      1,      vs32,   vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp	0,	vs32,	vs48
+	xvf64gerpp	1,	vs32,	vs49
+#else
+	xvf64gerpp      0,      vs32,   vs49
+	xvf64gerpp      1,      vs32,   vs48
+#endif
 	lxvp	vs32,	DISP4(\Index, 64)(AO)	// load real,imag from A
 	lxvp	vs48,	DISP4(\Index, 64)(BO)	// load real imag from B
-        xvf64gerpp      0,      vs40,   vs51
-        xvf64gerpp      1,      vs40,   vs50
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp	0,	vs40,	vs50
+	xvf64gerpp	1,	vs40,	vs51
+#else
+	xvf64gerpp      0,      vs40,   vs51
+	xvf64gerpp      1,      vs40,   vs50
+#endif
 .if \IsLast==1
 	addi	AO, AO, DISP4(\Index,64)
 	addi	BO, BO, DISP4(\Index,64)
@@ -502,8 +644,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  
 .macro LOAD_END_2x2  OffsetA,OffsetB
-        xvf64gerpp      0,      vs32,   vs49
-        xvf64gerpp      1,      vs32,   vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp	0,	vs32,	vs48
+	xvf64gerpp	1,	vs32,	vs49
+#else
+	xvf64gerpp      0,      vs32,   vs49
+	xvf64gerpp      1,      vs32,   vs48
+#endif
 	addi	BO, BO, \OffsetB
 	addi	AO, AO, \OffsetA
 .endm
@@ -526,7 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         xxpermdi vs37, vs4, vs5, 0b10
         xxpermdi vs38, vs6, vs7, 0b01
         xxpermdi vs39, vs6, vs7, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxlor vs0, vs32, vs32
+	xxlor vs1, vs33, vs33
+	xxlor vs2, vs34, vs34
+	xxlor vs3, vs35, vs35
+	xxlor vs4, vs36, vs36
+	xxlor vs5, vs37, vs37
+	xxlor vs6, vs38, vs38
+	xxlor vs7, vs39, vs39
+#else
 	xxlor vs2, vs32, vs32
 	xxlor vs3, vs33, vs33
 	xxlor vs0, vs34, vs34
@@ -535,7 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxlor vs7, vs37, vs37
 	xxlor vs4, vs38, vs38
 	xxlor vs5, vs39, vs39
-
+#endif
 	SAVE2  vs0,vs1,vs2,vs3,CO,0
 	SAVE2  vs4,vs5,vs6,vs7,T1,0 
 	addi	CO, CO, 32 
@@ -702,14 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvp	vs44,	DISP16(\Index, 192)(AO)	// load real,imag from A
 	lxvp	vs46,	DISP16(\Index, 224)(AO)	// load real,imag from A
 	lxvp	vs48,	DISP2(\Index,    0)(BO)	// load real imag from B
-        xvf64gerpp      0,      vs32,   vs49
-        xvf64gerpp      1,      vs34,   vs49
-        xvf64gerpp      2,      vs36,   vs49
-        xvf64gerpp      3,      vs38,   vs49
-        xvf64gerpp      0,      vs40,   vs48
-        xvf64gerpp      1,      vs42,   vs48
-        xvf64gerpp      2,      vs44,   vs48
-        xvf64gerpp      3,      vs46,   vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp	0,	vs32,	vs48
+	xvf64gerpp	1,	vs34,	vs48
+	xvf64gerpp	2,	vs36,	vs48
+	xvf64gerpp	3,	vs38,	vs48
+	xvf64gerpp	0,	vs40,	vs49
+	xvf64gerpp	1,	vs42,	vs49
+	xvf64gerpp	2,	vs44,	vs49
+	xvf64gerpp	3,	vs46,	vs49
+#else
+	xvf64gerpp      0,      vs32,   vs49
+	xvf64gerpp      1,      vs34,   vs49
+	xvf64gerpp      2,      vs36,   vs49
+	xvf64gerpp      3,      vs38,   vs49
+	xvf64gerpp      0,      vs40,   vs48
+	xvf64gerpp      1,      vs42,   vs48
+	xvf64gerpp      2,      vs44,   vs48
+	xvf64gerpp      3,      vs46,   vs48
+#endif
 .if \IsLast==1
 	addi	AO, AO, DISP16(\Index,256)
 	addi	BO, BO,  DISP2(\Index,32)
@@ -758,7 +925,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         xxpermdi vs45, vs12, vs13, 0b10
         xxpermdi vs46, vs14, vs15, 0b01
         xxpermdi vs47, vs14, vs15, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxlor vs0, vs32, vs32
+	xxlor vs1, vs33, vs33
+	xxlor vs2, vs34, vs34
+	xxlor vs3, vs35, vs35
+	xxlor vs4, vs36, vs36
+	xxlor vs5, vs37, vs37
+	xxlor vs6, vs38, vs38
+	xxlor vs7, vs39, vs39
+	xxlor vs8, vs40, vs40
+	xxlor vs9, vs41, vs41
+	xxlor vs10, vs42, vs42
+	xxlor vs11, vs43, vs43
+	xxlor vs12, vs44, vs44
+	xxlor vs13, vs45, vs45
+	xxlor vs14, vs46, vs46
+	xxlor vs15, vs47, vs47
+#else
 	xxlor vs2, vs32, vs32
 	xxlor vs3, vs33, vs33
 	xxlor vs0, vs34, vs34
@@ -775,7 +959,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxlor vs15, vs45, vs45
 	xxlor vs12, vs46, vs46
 	xxlor vs13, vs47, vs47
-
+#endif
 	SAVE8  vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
 	addi	CO, CO, 128
 .endm
@@ -799,10 +983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvp	vs40,	DISP8(\Index, 64)(AO)	// load real,imag from A
 	lxvp	vs42,	DISP8(\Index, 96)(AO)	// load real,imag from A
 	lxvp	vs48,	DISP2(\Index,  0)(BO)	// load real imag from B
-        xvf64gerpp      0,      vs32,   vs49
-        xvf64gerpp      1,      vs34,   vs49
-        xvf64gerpp      0,      vs40,   vs48
-        xvf64gerpp      1,      vs42,   vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp	0,	vs32,	vs48
+	xvf64gerpp	1,	vs34,	vs48
+	xvf64gerpp	0,	vs40,	vs49
+	xvf64gerpp	1,	vs42,	vs49
+#else
+	xvf64gerpp      0,      vs32,   vs49
+	xvf64gerpp      1,      vs34,   vs49
+	xvf64gerpp      0,      vs40,   vs48
+	xvf64gerpp      1,      vs42,   vs48
+#endif
 .if \IsLast==1
 	addi	AO, AO, DISP8(\Index,128)
 	addi	BO, BO,  DISP2(\Index,32)
@@ -837,7 +1028,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         xxpermdi vs37, vs4, vs5, 0b10
         xxpermdi vs38, vs6, vs7, 0b01
         xxpermdi vs39, vs6, vs7, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxlor vs0, vs32, vs32
+	xxlor vs1, vs33, vs33
+	xxlor vs2, vs34, vs34
+	xxlor vs3, vs35, vs35
+	xxlor vs4, vs36, vs36
+	xxlor vs5, vs37, vs37
+	xxlor vs6, vs38, vs38
+	xxlor vs7, vs39, vs39
+#else
 	xxlor vs2, vs32, vs32
 	xxlor vs3, vs33, vs33
 	xxlor vs0, vs34, vs34
@@ -846,7 +1046,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxlor vs7, vs37, vs37
 	xxlor vs4, vs38, vs38
 	xxlor vs5, vs39, vs39
-
+#endif
 	SAVE4  vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
 	addi	CO, CO, 64
 .endm
@@ -867,8 +1067,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvp	vs32,	DISP4(\Index,  0)(AO)	// load real,imag from A
 	lxvp	vs40,	DISP4(\Index, 32)(AO)	// load real,imag from A
 	lxvp	vs48,	DISP2(\Index,  0)(BO)	// load real imag from B
-        xvf64gerpp      0,      vs32,   vs49
-        xvf64gerpp      0,      vs40,   vs48
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xvf64gerpp	0,	vs32,	vs48
+	xvf64gerpp	0,	vs40,	vs49
+#else
+	xvf64gerpp      0,      vs32,   vs49
+	xvf64gerpp      0,      vs40,   vs48
+#endif
 .if \IsLast==1
 	addi	AO, AO, DISP4(\Index,64)
 	addi	BO, BO, DISP2(\Index,32)
@@ -896,11 +1101,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         xxpermdi vs33, vs0, vs1, 0b10
         xxpermdi vs34, vs2, vs3, 0b01
         xxpermdi vs35, vs2, vs3, 0b10
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	xxlor vs0, vs32, vs32
+	xxlor vs1, vs33, vs33
+	xxlor vs2, vs34, vs34
+	xxlor vs3, vs35, vs35
+#else
 	xxlor vs2, vs32, vs32
 	xxlor vs3, vs33, vs33
 	xxlor vs0, vs34, vs34
 	xxlor vs1, vs35, vs35
+#endif
 
 	SAVE2  vs0,vs1,vs2,vs3,CO,0
 	addi	CO, CO, 32 
diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c
index d3bf60ca..e42eafab 100644
--- a/kernel/power/zgemv_t_4.c
+++ b/kernel/power/zgemv_t_4.c
@@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #elif HAVE_KERNEL_4x4_VEC
 
-#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#if defined(POWER10)
 typedef __vector unsigned char  vec_t;
 typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
 
diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c
index 59ddc149..0068138e 100644
--- a/kernel/power/zscal.c
+++ b/kernel/power/zscal.c
@@ -43,16 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(DOUBLE)
 #include "zscal_microk_power8.c"
 #endif
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#elif defined(POWER10)
 #if defined(DOUBLE)
 #include "zscal_microk_power10.c"
 #else
 #include "cscal_microk_power10.c"
 #endif
-#elif defined(POWER10)
-#if defined(DOUBLE)
-#include "zscal_microk_power8.c"
-#endif
 #endif
 #endif
 
diff --git a/kernel/power/zscal_microk_power10.c b/kernel/power/zscal_microk_power10.c
index 15b8323f..af99b864 100644
--- a/kernel/power/zscal_microk_power10.c
+++ b/kernel/power/zscal_microk_power10.c
@@ -42,7 +42,11 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
 
        "xsnegdp		33, %x10	\n\t"	// -alpha_i
        XXSPLTD_S(32,%x9,0)	// alpha_r , alpha_r
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       XXMRGHD_S(33,33, %x10) // -alpha_i , alpha_i
+#else
        XXMRGHD_S(33,%x10, 33)	// -alpha_i , alpha_i
+#endif
 
        "lxvp		40, 0(%2)	\n\t"
        "lxvp		42, 32(%2)	\n\t"
@@ -97,10 +101,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
        "xvadddp		49, 49, 39	\n\t"
        "xvadddp		50, 50, %x3	\n\t"
        "xvadddp		51, 51, %x4	\n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "stxv        48, 0(%2)   \n\t"
+       "stxv        49, 16(%2)  \n\t"
+       "stxv        50, 32(%2)  \n\t"
+       "stxv        51, 48(%2)  \n\t"
+#else
        "stxv		49, 0(%2)	\n\t"
        "stxv		48, 16(%2)	\n\t"
        "stxv		51, 32(%2)	\n\t"
        "stxv		50, 48(%2)	\n\t"
+#endif
 
 
        "xvadddp		34, 34, %x5	\n\t"
@@ -109,12 +120,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
 
        "xvadddp		36, 36, %x7	\n\t"
        "xvadddp		37, 37, %x8	\n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "stxv        34, 64(%2)  \n\t"
+       "stxv        35, 80(%2)  \n\t"
+       "stxv        36, 96(%2)  \n\t"
+       "stxv        37, 112(%2) \n\t"
+#else
        "stxv		35, 64(%2)	\n\t"
        "stxv		34, 80(%2)	\n\t"
        "stxv		37, 96(%2)	\n\t"
        "stxv		36, 112(%2)	\n\t"
-
+#endif
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -8	\n\t"
@@ -155,23 +171,34 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
 
        "xvadddp		50, 50, %x3	\n\t"
        "xvadddp		51, 51, %x4	\n\t"
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "stxv        48, 0(%2)   \n\t"
+       "stxv        49, 16(%2)  \n\t"
+       "stxv        50, 32(%2)  \n\t"
+       "stxv        51, 48(%2)  \n\t"
+#else
        "stxv		49, 0(%2)	\n\t"
        "stxv		48, 16(%2)	\n\t"
        "stxv		51, 32(%2)	\n\t"
        "stxv		50, 48(%2)	\n\t"
-
+#endif
        "xvadddp		34, 34, %x5	\n\t"
        "xvadddp		35, 35, %x6	\n\t"
 
 
        "xvadddp		36, 36, %x7	\n\t"
        "xvadddp		37, 37, %x8	\n\t"
-
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+       "stxv        34, 64(%2)  \n\t"
+       "stxv        35, 80(%2)  \n\t"
+       "stxv        36, 96(%2)  \n\t"
+       "stxv        37, 112(%2) \n\t"
+#else
        "stxv		35, 64(%2)	\n\t"
        "stxv		34, 80(%2)	\n\t"
        "stxv		37, 96(%2)	\n\t"
        "stxv		36, 112(%2)	\n\t"
-
+#endif
      "#n=%1 x=%0=%2 alpha=(%9,%10) \n"
      :
        "+m" (*x),
diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c
index 908802b7..fe787185 100644
--- a/kernel/power/zswap.c
+++ b/kernel/power/zswap.c
@@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "zswap_microk_power8.c"
-#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+#elif defined(POWER10)
 #include "cswap_microk_power10.c"
-#elif defined(POWER10)
-#include "zswap_microk_power8.c"
 #endif
 #endif
 
diff --git a/param.h b/param.h
index 48770fa7..038233c1 100644
--- a/param.h
+++ b/param.h
@@ -2465,13 +2465,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 8
-#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-#define DGEMM_DEFAULT_UNROLL_M 16
-#define DGEMM_DEFAULT_UNROLL_N 4
-#else
 #define DGEMM_DEFAULT_UNROLL_M 8
 #define DGEMM_DEFAULT_UNROLL_N 8
-#endif
 #define CGEMM_DEFAULT_UNROLL_M 8
 #define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_M 8
-- 
2.31.1


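Note on the BE/LE splits in patch 1: lxvp fills an even/odd VSX register
pair from 32 bytes of storage, and the two 128-bit halves land in opposite
registers of the pair on big- versus little-endian, so the big-endian
branches swap the vs48/vs49 (and vs50/vs51) operands of xvf64gerpp, swap
xxmrghd/xxmrgld in the merge macros, and reverse the stxv register order
to match. A minimal standalone sketch of the compile-time guard used for
every such split (illustration only, not part of the kernels; the register
numbering stated in the comments is the assumption the swaps rely on):

    #include <stdio.h>

    int main (void)
    {
    #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
      /* assumed: lxvp puts the low-address quadword in the even VSR */
      puts ("big-endian build: BE operand order selected");
    #else
      /* assumed: lxvp puts the low-address quadword in the odd VSR */
      puts ("little-endian build: LE operand order selected");
    #endif
      return 0;
    }
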
From 9cc95e56579d865a3000c46e19c03455a3be3375 Mon Sep 17 00:00:00 2001
From: kavanabhat <Kavana.bhat@in.ibm.com>
Date: Fri, 1 Oct 2021 05:18:35 -0500
Subject: [PATCH 2/2] AIX changes for P10 with GNU Compiler: fall back to
 POWER8 kernels when GNU as is unavailable

---
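Notes: with the trailing "; echo $$?" below, HAVE_GAS holds the exit status
of grepping the `as -v` banner for "GNU": 0 when the assembler is GNU as,
1 otherwise (for example the AIX system assembler). KERNEL.POWER10 then
includes the POWER8 kernel set only when GNU as is not available.
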
 Makefile.system             | 2 ++
 kernel/power/KERNEL.POWER10 | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 20db80d0..1ed79275 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -16,6 +16,8 @@ else
 HOSTARCH = $(ARCH)
 endif
 
+HAVE_GAS := $(shell as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
+
 # Catch conflicting usage of ARCH in some BSD environments
 ifeq ($(ARCH), amd64)
 override ARCH=x86_64
diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 50866c97..63816cb5 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -1,4 +1,6 @@
-
+ifeq ($(HAVE_GAS), 1)
+include $(KERNELDIR)/KERNEL.POWER8
+else
 #SGEMM_BETA = ../generic/gemm_beta.c
 #DGEMM_BETA = ../generic/gemm_beta.c
 #CGEMM_BETA = ../generic/zgemm_beta.c
@@ -216,4 +218,4 @@ QCABS_KERNEL	= ../generic/cabs.c
 #Dump kernel
 CGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
 ZGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
-
+endif
-- 
2.31.1


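With both patches applied, a big-endian AIX build that has GNU binutils
installed compiles the POWER10 kernels directly (for example with
`make TARGET=POWER10 CC=gcc FC=gfortran`, invocation assumed), while a
build whose `as` is the AIX system assembler falls back to the POWER8
kernel set through the HAVE_GAS check in patch 2.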